For the reader: You can click the black “code” box on the right hand side to either see or hide the R code behind analysis results and graphs.
The results are arranged to separate tabs by site/area, so that it’s easy to look at just the results of the site of interest, instead of scrolling accidentally down to a different site and getting confused.
The results for each site consist of four parts: pXRF, particle size distribution, loss on ignition, and colorimetry.
Intro about the site
From literature we can identify three rough classes of elements that could be used to analyze mudbricks from pXRF results:
Good: Rb, Sr, Y, Zr, Nb, Th, Pb, Cu, Zn, Fe
Neutral: V, Cr, Co, Ni, Ca, Ti
Bad: Na, P, Ba (+ Ca, Al)
The “bad” elements were excluded from the pXRF results from the get-go. After scaling and averaging the remaining values we ended up with two analysis datasets: “pxrf_all” that contains all the elements that have non-zero results, and “pxrf_no_na”, in which all the elements that have even one NA result are omitted.
# Libraries
library(openxlsx);
library(dplyr);
library(GGally); # GGpairs
library(ggfortify); # PCA autoplots
library(FactoMineR); # Fast PCA graphs
library(dendextend); # Dendrograms
# Read the raw data
pxrf <- read.xlsx("data/pxrf_AY.xlsx", sep=";")
# Remove any rows containing words 'ERROR' or 'TEST'
pxrf <- filter(pxrf, !grepl('TEST', Sample))
pxrf <- filter(pxrf, !grepl('ERROR', Sample))
# Remove unused columns
pxrf <- pxrf %>%
select(-Site) %>%
select(-'Other;name') %>%
select(-'File;#') %>%
select(-DateTime) %>%
select(-Application) %>%
select(-Method) %>%
select(-ElapsedTime) %>%
select(-'Cal;Check')
# Coercing "< LOD" -results to NA:s
pxrf[4:93] <- pxrf[4:93] %>% mutate_if(is.character,as.numeric)
# Averaging results by "Sample", "Area" and "Type" columns
pxrf_averaged <- aggregate(pxrf, by = list(pxrf$Sample, pxrf$Area, pxrf$Type), FUN = mean)
# Removing old (now empty) columns
pxrf_averaged <- pxrf_averaged %>% select(-Sample)
pxrf_averaged <- pxrf_averaged %>% select(-Area)
pxrf_averaged <- pxrf_averaged %>% select(-Type)
# Assigning sample names as new row names
rownames(pxrf_averaged) <- pxrf_averaged$Group.1
pxrf_averaged <- pxrf_averaged %>% select(-Group.1)
# Renaming the other two newly created groups back to "Type" and "Area"
names(pxrf_averaged)[names(pxrf_averaged) == "Group.2"] <- "Area"
names(pxrf_averaged)[names(pxrf_averaged) == "Group.3"] <- "Type"
# Removing error columns from the scaled analysis data set
pxrf_scaled <- pxrf_averaged %>% select(-ends_with("Err"))
# Scaling the data with standard z-score method
pxrf_scaled[3:47] <- scale(pxrf_scaled[3:47])
# Removing columns that only contain NA values
all_na <- function(x) any(!is.na(x))
pxrf_all <- pxrf_scaled %>% select_if(all_na)
pxrf_averaged: All averaged data including averaged errors (Uine’s note: for later use)
colnames(pxrf_averaged)
## [1] "Area" "Type" "MgO" "MgO;Err" "Al2O3" "Al2O3;Err"
## [7] "SiO2" "SiO2;Err" "P2O5" "P2O5;Err" "S" "S;Err"
## [13] "Cl" "Cl;Err" "K2O" "K2O;Err" "CaO" "CaO;Err"
## [19] "Ti" "Ti;Err" "V" "V;Err" "Cr" "Cr;Err"
## [25] "Mn" "Mn;Err" "Fe" "Fe;Err" "Co" "Co;Err"
## [31] "Ni" "Ni;Err" "Cu" "Cu;Err" "Zn" "Zn;Err"
## [37] "As" "As;Err" "Se" "Se;Err" "Rb" "Rb;Err"
## [43] "Sr" "Sr;Err" "Y" "Y;Err" "Zr" "Zr;Err"
## [49] "Nb" "Nb;Err" "Mo" "Mo;Err" "Rh" "Rh;Err"
## [55] "Pd" "Pd;Err" "Ag" "Ag;Err" "Cd" "Cd;Err"
## [61] "Sn" "Sn;Err" "Sb" "Sb;Err" "Ba" "Ba;Err"
## [67] "La" "La;Err" "Ce" "Ce;Err" "Hf" "Hf;Err"
## [73] "Ta" "Ta;Err" "W" "W;Err" "Pt" "Pt;Err"
## [79] "Au" "Au;Err" "Hg" "Hg;Err" "Tl" "Tl;Err"
## [85] "Pb" "Pb;Err" "Bi" "Bi;Err" "Th" "Th;Err"
## [91] "U" "U;Err"
pxrf_all: All elements, the numbers below are the amount of NA values
colSums(is.na(pxrf_all))
## Area Type MgO Al2O3 SiO2 P2O5 S Cl K2O CaO Ti V Mn
## 0 0 1 0 0 0 43 1 0 0 0 28 0
## Fe Ni Cu Zn As Rb Sr Y Zr Nb Ba
## 0 1 0 0 27 0 0 2 0 31 45
pxrf_final: Final analysis data set with only the selected elements
# Dropping the dubious elements and elements with too many NA values
pxrf_final <- pxrf_all %>%
select(-Al2O3) %>%
select(-CaO) %>%
select(-SiO2) %>%
select(-Cl) %>%
select(-K2O) %>%
select(-Ni) %>%
select(-P2O5) %>%
select(-S) %>%
select(-V) %>%
select(-As) %>%
select(-Nb) %>%
select(-Ba)
# Transforming remaining NA values to zeros for analysis
pxrf_final[is.na(pxrf_final)] <- 0
# Final analysis data set
colnames(pxrf_final)
## [1] "Area" "Type" "MgO" "Ti" "Mn" "Fe" "Cu" "Zn" "Rb" "Sr"
## [11] "Y" "Zr"
# K-means clustering in ggpairs: 2 clusters
km <-kmeans(pxrf_final[3:12], centers = 2)
cluster_x <- as.factor(km$cluster)
ggpairs(pxrf_final[3:12], mapping = aes(col = cluster_x))
# PC autoplot with K-means clusters
set.seed(1)
autoplot(kmeans(pxrf_final[3:12], 7), data = pxrf_final, label = TRUE, label.size = 3)
PCA with final elements:
# Elements
colnames(pxrf_final[3:12])
## [1] "MgO" "Ti" "Mn" "Fe" "Cu" "Zn" "Rb" "Sr" "Y" "Zr"
# PCA with NA:s converted to zeros
pxrf_final[is.na(pxrf_final)] <- 0
pca_1 <- prcomp(pxrf_final[3:12])
summary(pca_1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.3836 1.2233 1.0086 0.84957 0.60483 0.49903 0.44478
## Proportion of Variance 0.5719 0.1506 0.1024 0.07265 0.03682 0.02507 0.01991
## Cumulative Proportion 0.5719 0.7225 0.8249 0.89752 0.93434 0.95941 0.97932
## PC8 PC9 PC10
## Standard deviation 0.3538 0.22250 0.17534
## Proportion of Variance 0.0126 0.00498 0.00309
## Cumulative Proportion 0.9919 0.99691 1.00000
# PCA_1 plots
biplot(pca_1, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"), main = "PCA Ashdod-Yam elements")
autoplot(pca_1, data=pxrf_final, colour='Area', shape = FALSE, label = TRUE, main = "PCA Ashdod-Yam grouped by area")
autoplot(pca_1, data=pxrf_final, colour='Type', shape = FALSE, label = TRUE, main = "PCA Ashdod-Yam grouped by sample type")
PCA(pxrf_final[3:12])
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 47 individuals, described by 10 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
# HCA dendrogram, samples color coded by area:
dend <-
pxrf_final %>% # data
dist %>% # calculate a distance matrix
hclust(method = "ward.D2") %>% # hierarchical clustering
as.dendrogram %>% # turn the object into a dendrogram
color_branches(k=7)
area <- as.factor(pxrf_final[, 1])
n_area <- length(unique(area))
cols_a <- colorspace::rainbow_hcl(n_area, c = 70, l = 50)
col_area <- cols_a[area]
labels_colors(dend) <- col_area[order.dendrogram(dend)]
plot(dend, main="HCA with sample areas")
legend("topright", legend = levels(area), fill = cols_a)
# HCA dendrogram, samples color coded by type:
dend2 <-
pxrf_final %>% # data
dist %>% # calculate a distance matrix
hclust(method = "ward.D2") %>% # hierarchical clustering
as.dendrogram %>% # turn the object into a dendrogram
color_branches(k=7)
type <- as.factor(pxrf_final[, 2])
n_type <- length(unique(type))
cols_t <- colorspace::rainbow_hcl(n_type, c = 70, l = 50)
col_type <- cols_t[type]
labels_colors(dend2) <- col_type[order.dendrogram(dend2)]
plot(dend2, main="HCA with sample types")
legend("topright", legend = levels(type), fill = cols_t)
# Libraries
library(dplyr);
library(openxlsx);
library(ggtern)
# Read and filter data
grain <- read.xlsx("data/grain_AY.xlsx", sep=";")
# Remove automatically created averages in order to include results from multiple runs for the same sample
grain <- grain %>%
filter(!grepl('Average', Sample)) %>%
filter(!grepl('test', Sample))
# Average data by "Sample", "Context" and "Type" columns from the original dataset
grain <- aggregate(grain, by=list(grain$Sample, grain$Context, grain$Type), FUN=mean)
# Remove the now empty columns for clarity
grain <- grain %>% select(-Sample)
grain <- grain %>% select(-Context)
grain <- grain %>% select(-Type)
# Assign sample names ("Group.1") as row names, and rename the other created columns back to "Type" and "Area" for clarity
rownames(grain) <- grain$Group.1
grain <- grain %>% select(-Group.1)
names(grain)[names(grain) == "Group.2"] <- "Context"
names(grain)[names(grain) == "Group.3"] <- "Type"
# Scaling clay-silt-sand portions to values between 0 and 1
# (Otherwise the differences in low clay percentages are too small to differentiate)
normalize <- function(x, na.rm=TRUE){(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))}
# Apply function "normalize" to clay, silt and sand columns
grain_01 <- as.data.frame(apply(grain[4:6], 2, normalize))
# Ternary plots
ggtern(data=grain_01, aes(x=Clay, y=Sand, z=Silt, color=grain$Context)) +
labs(title="Clay silt sand") +
theme_rgbw() +
theme_nomask() +
geom_point(size=0) +
geom_text(aes(label=rownames(grain_01)), size=3)
ggtern(data=grain_01, aes(x=Clay, y=Sand, z=Silt, color=grain$Context)) +
labs(title="Clay silt sand") +
theme_rgbw() +
theme_nomask() +
geom_point(size=2)
ggtern(data=grain_01, aes(x=Clay, y=Sand, z=Silt, color=grain$Type)) +
labs(title="Clay silt sand") +
theme_nomask() +
geom_mask() +
geom_point(size=2)
We did loss on ignition analysis both in the traditional manual method and with a modern TGA for comparison.
# Libraries
library(openxlsx);
library(dplyr);
library(ggplot2);
library(ggrepel);
library(tidyr);
loi <- read.xlsx("data/loi_AY.xlsx", sep=";")
tga <- read.xlsx("data/tga_AY.xlsx", sep=";")
summary(loi)
## sample crucible wet_weight sample_wet
## Length:52 Min. :8.079 Min. :10.13 Min. :1.538
## Class :character 1st Qu.:8.566 1st Qu.:11.00 1st Qu.:2.464
## Mode :character Median :9.006 Median :11.71 Median :2.635
## Mean :8.994 Mean :11.55 Mean :2.559
## 3rd Qu.:9.435 3rd Qu.:12.05 3rd Qu.:2.824
## Max. :9.925 Max. :12.68 Max. :3.043
## dry_weight after_550_C after_950_C context
## Min. :10.11 Min. :10.08 Min. :10.05 Length:52
## 1st Qu.:10.97 1st Qu.:10.94 1st Qu.:10.86 Class :character
## Median :11.67 Median :11.62 Median :11.53 Mode :character
## Mean :11.52 Mean :11.47 Mean :11.41
## 3rd Qu.:12.00 3rd Qu.:11.96 3rd Qu.:11.93
## Max. :12.65 Max. :12.60 Max. :12.52
## type
## Length:52
## Class :character
## Mode :character
##
##
##
summary(tga)
## Name Analysis;Date Batch Wet
## Length:53 Length:53 Length:53 Length:53
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Dry Mass_550 Mass_950 context
## Length:53 Length:53 Length:53 Length:53
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## type
## Length:53
## Class :character
## Mode :character
# Remove rows containing words 'kekkila' (internal standard)
tga <- filter(tga, !grepl('kekkila', Name))
# TGA character columns to numeric
tga[, c(4:7)] <- sapply(tga[, c(4:7)], as.numeric)
# Sample names as rownames
rownames(loi) <- loi$sample
rownames(tga) <- tga$Name
# Creating new columns for analysis by subtracting crucible mass;
# "dry" (sample mass after drying)
# "c550" (sample mass after 550 C)
# "c950" (sample mass after 950 C)
loi$dry <- (loi$dry_weight - loi$crucible)
loi$c550mass <- (loi$after_550_C - loi$crucible)
loi$c950mass <- (loi$after_950_C - loi$crucible)
loi$c550 <- (loi$dry - loi$c550mass)/(loi$dry) * 100
loi$c950 <- (loi$c550mass - loi$c950mass)/(loi$c550mass) * 100
# Same with TGA
tga$c550 <- (tga$Dry - tga$Mass_550)/(tga$Dry) * 100
tga$c950 <- (tga$Mass_550 - tga$Mass_950)/(tga$Mass_550) * 100
# Distribution of LOI in 550 C (1) and 950 C (2)
boxplot(c550~context, data=loi)
boxplot(c950~context, data=loi)
# boxplot.stats gives us the lower whisker or minimum, the interquartile range of the middle (the second and fourth values), the median (third value) and the upper whisker or maximum.
boxplot.stats(loi$c950)
## $stats
## [1] 0.398150 1.528018 2.198513 3.162314 4.889189
##
## $n
## [1] 52
##
## $conf
## [1] 1.840428 2.556598
##
## $out
## [1] 8.041755 11.960444
# The two high carbonate outliers are both AY-53 (two different measurements to make sure the first batch wasn't misleading)
# Removing duplicates from the next graph
rownames(loi)
## [1] "AY-1" "AY-2" "AY-3" "AY-4" "AY-5" "AY-6" "AY-7"
## [8] "AY-8" "AY-9" "AY-10" "AY-11" "AY-12" "AY-13" "AY-14"
## [15] "AY-15" "AY-16" "AY-17" "AY-18" "AY-19" "AY-20" "AY-21"
## [22] "AY-22" "AY-23" "AY-24" "AY-25" "AY-26" "AY-27" "AY-28"
## [29] "AY-29" "AY-30" "AY-31" "AY-32" "AY-33" "AY-34" "AY-35"
## [36] "AY-36" "AY-37" "AY-38" "AY-39" "AY-40" "AY-41" "AY-42"
## [43] "AY-50" "AY-51" "AY-52" "AY-53" "AY-54" "AY-50_2" "AY-51_2"
## [50] "AY-52_2" "AY-53_2" "AY-54_2"
loi <- subset(loi[1:47, ])
# Color by context
ggplot(loi,
aes(c550, c950, label = rownames(loi), colour = factor(context))) +
geom_point(size=2, aes(shape = factor(type))) +
geom_text_repel(size=3) + labs(title = "Organic vs. carbonate content") +
xlab('Organic LOI (%)') +
ylab('Carbonate LOI (%)') +
theme(axis.title = element_text())
# Color by type
ggplot(loi,
aes(c550, c950)) +
geom_point(size=2, aes(shape = factor(type), colour = factor(context))) +
labs(title = "Organic vs. carbonate content") +
xlab('Organic LOI (%)') +
ylab('Carbonate LOI (%)') +
theme(axis.title = element_text())
# Stacked barplot (x = reorder(sample, LOI))
loi_long = gather(loi, key = var, value = value, c550, c950)
ggplot(loi_long, aes(x = reorder(sample, value), y = value, fill = var)) +
geom_bar(stat = 'identity', position = 'stack') +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Distribution of LOI in 550 C (1) and 950 C (2)
boxplot(c550~context, data=tga)
boxplot(c950~context, data=tga)
# boxplot.stats gives us the lower whisker or minimum, the interquartile range of the middle (the second and fourth values), the median (third value) and the upper whisker or maximum.
boxplot.stats(tga$c950)
## $stats
## [1] 0.7233976 1.5855926 2.3958197 3.4621816 5.4590326
##
## $n
## [1] 50
##
## $conf
## [1] 1.976504 2.815135
##
## $out
## [1] 7.994718
# Removing duplicates from the next graph
#rownames(tga)
#loi <- subset(loi[1:47, ])
ggplot(tga,
aes(c550, c950, label = rownames(tga), colour = factor(context))) +
geom_point(size=2) +
geom_text_repel(size=3) + labs(title = "Organic vs. carbonate content") +
xlab('Organic LOI (%)') +
ylab('Carbonate LOI (%)') +
theme(axis.title = element_text())
barplot(tga$c550)
barplot(tga$c950)
# Stacked barplot (x = reorder(sample, LOI))
tga_long = gather(tga, key = var, value = value, c550, c950)
ggplot(tga_long, aes(x = reorder(Name, value), y = value, fill = var)) +
geom_bar(stat = 'identity', position = 'stack') +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Libraries
library(dplyr);
library(ggplot2);
library(ggrepel);
library(rgl);
library(plot3D);
library(colordistance)
color1 <- read.table("data/color_AY.txt", header = TRUE, sep = "\t", dec = ",")
rownames(color1) <- color1$Data.Name
color <- subset(color1[, 3:5])
summary(color)
## L a b
## Min. :37.56 Min. : 7.97 Min. :16.81
## 1st Qu.:42.13 1st Qu.: 9.72 1st Qu.:19.76
## Median :43.64 Median :10.38 Median :21.18
## Mean :43.92 Mean :10.49 Mean :21.22
## 3rd Qu.:45.91 3rd Qu.:11.29 3rd Qu.:22.34
## Max. :48.24 Max. :13.53 Max. :25.03
ggplot(color,
aes(a, b, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "Color (a*b)") +
xlab("a D65") +
ylab("b D65")
ggplot(color1,
aes(X, L, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "Luminance") +
ylab("L* (D65)") +
xlab("Sample number")
ggplot(color,
aes(a, L, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "A versus luminance") +
ylab("L* (D65)") +
xlab("a")
ggplot(color,
aes(b, L, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "B versus luminance") +
ylab("L* (D65)") +
xlab("b")
# Colorful 3D plot with drop lines
scatter3D(color$a, color$b, color$L,
xlab = "a* (D65)", ylab = "b* (D65)", zlab="L* (D65)",
phi = 0, bty = "g", type = "h",
ticktype = "detailed", pch = 19, cex = 1.2)
text3D(color$a, color$b, color$L, labels = rownames(color),
add = TRUE, colkey = FALSE, cex = 0.7)
scatter3D(color$L, color$a, color$b,
xlab = "L* (D65)", ylab = "a* (D65)", zlab="b* (D65)",
phi = 0, bty = "g", type = "h",
ticktype = "detailed", pch = 19, cex = 1.2)
text3D(color$L, color$a, color$b, labels = rownames(color),
add = TRUE, colkey = FALSE, cex = 0.7)
Intro about the site
From literature we can identify three rough classes of elements that could be used to analyze mudbricks from pXRF results:
Good: Rb, Sr, Y, Zr, Nb, Th, Pb, Cu, Zn, Fe
Neutral: V, Cr, Co, Ni, Ca, Ti
Bad: Na, P, Ba (+ Ca, Al)
The “bad” elements were excluded from the pXRF results from the get-go. After scaling and averaging the remaining values we ended up with two analysis datasets: “pxrf_all” that contains all the elements that have non-zero results, and “pxrf_no_na”, in which all the elements that have even one NA result are omitted.
# Libraries
library(openxlsx);
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(cluster);
library(ggplot2);
library(ggrepel);
library(ggfortify);
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
# Read the raw data
pxrf <- read.xlsx("data/pxrf_AYB.xlsx", sep=";")
# Remove any rows containing words 'ERROR' or 'TEST'
pxrf <- filter(pxrf, !grepl('TEST', Sample))
pxrf <- filter(pxrf, !grepl('ERROR', Sample))
# Turn any "LOD":s and missing values to "NA":s to allow scaling
# Removing "Na", S", "Cl", "P" and "Ba" already
values <- c("MgO", "Al2O3", "SiO2",
"K2O", "CaO", "Ti", "V", "Cr",
"Mn", "Fe", "Co", "Ni", "Cu", "Zn", "As",
"Se", "Rb", "Sr", "Y", "Zr", "Nb", "Mo",
"Rh", "Pd", "Ag", "Cd", "Sn", "Sb",
"La", "Ce", "Hf", "Ta", "W", "Pt", "Au",
"Hg", "Tl", "Pb", "Bi", "Th", "U")
pxrf_NA <- select(pxrf, one_of(values))
pxrf_NA <- pxrf_NA %>% mutate_if(is.character,as.numeric)
# Scaling the data with standard z-score method
scaled_pxrf <- as.data.frame(scale(pxrf_NA))
# Average data by "Sample", "Area" and "Type" columns from the original dataset
average_pxrf <- aggregate(scaled_pxrf, by = list(pxrf$Sample, pxrf$Area, pxrf$Type), FUN = mean)
# Assign sample names as row names, and rename the other newly created columns back to "Type" and "Area" for clarity
rownames(average_pxrf) <- average_pxrf$Group.1
average_pxrf <- average_pxrf %>% select(-Group.1)
names(average_pxrf)[names(average_pxrf) == "Group.2"] <- "Area"
names(average_pxrf)[names(average_pxrf) == "Group.3"] <- "Type"
# Change "Type" and "Area" to factors
average_pxrf$Type <- factor(average_pxrf$Type)
average_pxrf$Area <- factor(average_pxrf$Area)
# Removing columns that only have NA values
all_na <- function(x) any(!is.na(x))
pxrf_all <- average_pxrf %>% select_if(all_na)
# Removing columns if they contain any NA values at all
any_na <- function(x) all(!is.na(x))
pxrf_no_na <- average_pxrf %>% select_if(any_na)
pxrf_no_na:
summary(pxrf_no_na)
## Area Type Al2O3 SiO2
## Byzantine:7 ceiling:3 Min. :-0.6649899 Min. :-0.7149
## floor :1 1st Qu.:-0.5260944 1st Qu.:-0.5619
## PEM :3 Median :-0.3729803 Median :-0.3191
## Mean : 0.0000000 Mean : 0.0000
## 3rd Qu.:-0.0009942 3rd Qu.: 0.1869
## Max. : 2.0921476 Max. : 1.7841
## CaO Ti Mn Fe
## Min. :-1.6463 Min. :-0.6002745 Min. :-0.5364 Min. :-0.9003
## 1st Qu.:-0.5655 1st Qu.:-0.4813714 1st Qu.:-0.4011 1st Qu.:-0.6185
## Median : 0.3098 Median :-0.3873934 Median :-0.3370 Median :-0.4877
## Mean : 0.0000 Mean : 0.0000000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6541 3rd Qu.: 0.0005995 3rd Qu.:-0.1593 3rd Qu.: 0.5536
## Max. : 1.1593 Max. : 1.9492115 Max. : 1.9942 Max. : 1.5177
## Ni Cu Zn Sr
## Min. :-0.9780 Min. :-1.3184 Min. :-0.7633 Min. :-0.7906
## 1st Qu.:-0.4890 1st Qu.:-0.2432 1st Qu.:-0.5817 1st Qu.:-0.3883
## Median :-0.2574 Median :-0.1984 Median :-0.1408 Median :-0.2674
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.3089 3rd Qu.: 0.1600 3rd Qu.: 0.3261 3rd Qu.: 0.4585
## Max. : 1.5957 Max. : 1.6832 Max. : 1.4154 Max. : 0.9176
## Zr
## Min. :-0.69215
## 1st Qu.:-0.54451
## Median :-0.24621
## Mean : 0.00000
## 3rd Qu.: 0.02497
## Max. : 1.97745
pxrf_all:
summary(pxrf_all)
## Area Type MgO Al2O3
## Byzantine:7 ceiling:3 Min. :-0.9836 Min. :-0.6649899
## floor :1 1st Qu.:-0.4419 1st Qu.:-0.5260944
## PEM :3 Median :-0.2397 Median :-0.3729803
## Mean :-0.1222 Mean : 0.0000000
## 3rd Qu.: 0.2820 3rd Qu.:-0.0009942
## Max. : 0.7831 Max. : 2.0921476
## NA's :1
## SiO2 K2O CaO Ti
## Min. :-0.7149 Min. :-1.03235 Min. :-1.6463 Min. :-0.6002745
## 1st Qu.:-0.5619 1st Qu.:-0.54671 1st Qu.:-0.5655 1st Qu.:-0.4813714
## Median :-0.3191 Median :-0.25114 Median : 0.3098 Median :-0.3873934
## Mean : 0.0000 Mean : 0.06213 Mean : 0.0000 Mean : 0.0000000
## 3rd Qu.: 0.1869 3rd Qu.: 0.29801 3rd Qu.: 0.6541 3rd Qu.: 0.0005995
## Max. : 1.7841 Max. : 1.84282 Max. : 1.1593 Max. : 1.9492115
## NA's :2
## Mn Fe Ni Cu
## Min. :-0.5364 Min. :-0.9003 Min. :-0.9780 Min. :-1.3184
## 1st Qu.:-0.4011 1st Qu.:-0.6185 1st Qu.:-0.4890 1st Qu.:-0.2432
## Median :-0.3370 Median :-0.4877 Median :-0.2574 Median :-0.1984
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:-0.1593 3rd Qu.: 0.5536 3rd Qu.: 0.3089 3rd Qu.: 0.1600
## Max. : 1.9942 Max. : 1.5177 Max. : 1.5957 Max. : 1.6832
##
## Zn As Rb Sr
## Min. :-0.7633 Min. :-0.88791 Min. :-0.83065 Min. :-0.7906
## 1st Qu.:-0.5817 1st Qu.:-0.56643 1st Qu.:-0.55508 1st Qu.:-0.3883
## Median :-0.1408 Median :-0.03062 Median :-0.12204 Median :-0.2674
## Mean : 0.0000 Mean : 0.09185 Mean :-0.08267 Mean : 0.0000
## 3rd Qu.: 0.3261 3rd Qu.: 0.82668 3rd Qu.: 0.25195 3rd Qu.: 0.4585
## Max. : 1.4154 Max. : 1.13285 Max. : 0.90151 Max. : 0.9176
## NA's :1 NA's :1
## Y Zr Nb Ag
## Min. :-0.82699 Min. :-0.69215 Min. :-0.2048 Min. :0
## 1st Qu.:-0.64944 1st Qu.:-0.54451 1st Qu.:-0.2048 1st Qu.:0
## Median :-0.29435 Median :-0.24621 Median :-0.2048 Median :0
## Mean : 0.06074 Mean : 0.00000 Mean :-0.2048 Mean :0
## 3rd Qu.: 0.59337 3rd Qu.: 0.02497 3rd Qu.:-0.2048 3rd Qu.:0
## Max. : 1.48110 Max. : 1.97745 Max. :-0.2048 Max. :0
## NA's :2 NA's :6 NA's :6
## Pb
## Min. :-0.00135
## 1st Qu.: 0.29028
## Median : 0.39289
## Mean : 0.40100
## 3rd Qu.: 0.50361
## Max. : 0.81954
## NA's :3
# Drop the more dubious elements
pxrf_1 <- pxrf_no_na %>% select(-Al2O3)
pxrf_1 <- pxrf_1 %>% select(-CaO)
# Optimal number of clusters in K-means analysis: 2
k_max <- 6
twcss <- sapply(1:k_max, function(k){kmeans(pxrf_1[3:11], k)$tot.withinss})
qplot(x = 1:k_max, y = twcss, geom = 'line')
PCA with only the no-NA:s elements:
# PCA
pca_1 <- prcomp(pxrf_1[3:11])
summary(pca_1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.2445 0.8845 0.75353 0.40411 0.30308 0.06678 1.703e-16
## Proportion of Variance 0.7578 0.1177 0.08542 0.02457 0.01382 0.00067 0.000e+00
## Cumulative Proportion 0.7578 0.8755 0.96094 0.98551 0.99933 1.00000 1.000e+00
# PCA1 plots
biplot(pca_1, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"), main = "PCA limited elements")
autoplot(pca_1, data=pxrf_1, colour='Type', shape = FALSE, label = TRUE, main = "PCA grouped by area")
PCA with all the feasible elements:
# PCA with all elements, NA:s converted to zeros
pxrf_all[is.na(pxrf_all)] <- 0
pca_2 <- prcomp(pxrf_all[3:21])
summary(pca_2)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.6967 1.4173 1.03547 0.69371 0.65343 0.3604 2.873e-16
## Proportion of Variance 0.6384 0.1763 0.09413 0.04225 0.03748 0.0114 0.000e+00
## Cumulative Proportion 0.6384 0.8147 0.90887 0.95111 0.98860 1.0000 1.000e+00
# PCA2 plots
biplot(pca_2, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"), main = "PCA more elements")
autoplot(pca_2, data=pxrf_all, colour='Type', shape = FALSE, label = TRUE, main = "PCA more elements grouped by area")
# HCA
# HCA dendrogam 1
dend <-
pxrf_1[3:11] %>% # data
dist %>% # calculate a distance matrix
hclust(method = "ward.D2") %>% # hierarchical clustering
as.dendrogram %>% # turn the object into a dendrogram
set("branches_k_color", k=4) %>%
set("branches_lwd", 1.2) %>%
set("labels_col", c("red", "black", "blue"), k=4) %>%
set("labels_cex", 1) %>%
set("leaves_pch", NA)
# plot the dend in usual "base" plotting engine:
plot(dend)
# HCA dendrogam 2:
HCA.ward4 <- hclust(dist(pxrf_1[3:11], method="euclid"), method="ward.D2")
plot(HCA.ward4, main="HCA with clusters")
rect.hclust(HCA.ward4, k=4)
# Libraries
library(dplyr);
library(openxlsx);
library(ggtern)
# Read and filter data
grain <- read.xlsx("data/grain_AYB.xlsx", sep=";")
# Remove automatically created averages in order to include results from multiple runs for the same sample
grain <- grain %>%
filter(!grepl('Average', Sample)) %>%
filter(!grepl('test', Sample))
# Average data by "Sample", "Context" and "Type" columns from the original dataset
grain <- aggregate(grain, by=list(grain$Sample, grain$Context, grain$Type), FUN=mean)
# Remove the now empty columns for clarity
grain <- grain %>% select(-Sample)
grain <- grain %>% select(-Context)
grain <- grain %>% select(-Type)
# Assign sample names ("Group.1") as row names, and rename the other created columns back to "Type" and "Area" for clarity
rownames(grain) <- grain$Group.1
grain <- grain %>% select(-Group.1)
names(grain)[names(grain) == "Group.2"] <- "Context"
names(grain)[names(grain) == "Group.3"] <- "Type"
# Scaling clay-silt-sand portions to values between 0 and 1
# (Otherwise the differences in low clay percentages are too small to differentiate)
normalize <- function(x, na.rm=TRUE){(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))}
# Apply function "normalize" to clay, silt and sand columns
grain_01 <- as.data.frame(apply(grain[4:6], 2, normalize))
# Ternary plots
ggtern(data=grain_01, aes(x=Clay, y=Sand, z=Silt, color=grain$Context)) +
labs(title="Clay silt sand") +
theme_rgbw() +
theme_nomask() +
geom_point(size=0) +
geom_text(aes(label=rownames(grain_01)), size=3)
ggtern(data=grain_01, aes(x=Clay, y=Sand, z=Silt, color=grain$Type)) +
labs(title="Clay silt sand") +
theme_nomask() +
geom_mask() +
geom_point(size=2)
We did loss on ignition analysis both in the traditional manual method and with a modern TGA for comparison.
# Libraries
library(openxlsx);
library(dplyr);
library(ggplot2);
library(ggrepel);
library(tidyr);
loi <- read.xlsx("data/loi_AYB.xlsx", sep=";")
tga <- read.xlsx("data/tga_AYB.xlsx", sep=";")
summary(loi)
## sample crucible wet_weight sample_wet
## Length:7 Min. :8.382 Min. :10.92 Min. :2.257
## Class :character 1st Qu.:8.555 1st Qu.:11.39 1st Qu.:2.613
## Mode :character Median :8.666 Median :11.50 Median :2.880
## Mean :8.943 Mean :11.68 Mean :2.739
## 3rd Qu.:9.442 3rd Qu.:12.13 3rd Qu.:2.905
## Max. :9.560 Max. :12.31 Max. :3.002
## dry_weight after_550_C after_950_C context
## Min. :10.83 Min. :10.62 Min. :10.50 Length:7
## 1st Qu.:11.33 1st Qu.:11.25 1st Qu.:11.04 Class :character
## Median :11.45 Median :11.39 Median :11.21 Mode :character
## Mean :11.63 Mean :11.55 Mean :11.24
## 3rd Qu.:12.11 3rd Qu.:12.05 3rd Qu.:11.43
## Max. :12.26 Max. :12.24 Max. :12.06
## type
## Length:7
## Class :character
## Mode :character
##
##
##
summary(tga)
## Name Analysis;Date Batch Wet
## Length:7 Min. :44536 Length:7 Length:7
## Class :character 1st Qu.:44536 Class :character Class :character
## Mode :character Median :44536 Mode :character Mode :character
## Mean :44536
## 3rd Qu.:44536
## Max. :44536
## Dry Mass_550 Mass_950 context
## Length:7 Length:7 Length:7 Length:7
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## type
## Length:7
## Class :character
## Mode :character
##
##
##
# Remove rows containing words 'kekkila' (internal standard)
tga <- filter(tga, !grepl('kekkila', Name))
# TGA character columns to numeric
tga[, c(4:7)] <- sapply(tga[, c(4:7)], as.numeric)
# Sample names as rownames
rownames(loi) <- loi$sample
rownames(tga) <- tga$Name
# Creating new columns for analysis by subtracting crucible mass;
# "dry" (sample mass after drying)
# "c550" (sample mass after 550 C)
# "c950" (sample mass after 950 C)
loi$dry <- (loi$dry_weight - loi$crucible)
loi$c550mass <- (loi$after_550_C - loi$crucible)
loi$c950mass <- (loi$after_950_C - loi$crucible)
loi$c550 <- (loi$dry - loi$c550mass)/(loi$dry) * 100
loi$c950 <- (loi$c550mass - loi$c950mass)/(loi$c550mass) * 100
# Same with TGA
tga$c550 <- (tga$Dry - tga$Mass_550)/(tga$Dry) * 100
tga$c950 <- (tga$Mass_550 - tga$Mass_950)/(tga$Mass_550) * 100
# Total distribution of LOI in 550 C (1) and 950 C (2)
boxplot(loi$c550, loi$c950)
# Color by type
ggplot(loi,
aes(c550, c950)) +
geom_point(size=2, aes(shape = factor(context), colour = factor(type))) +
labs(title = "Organic vs. carbonate content") +
xlab('Organic LOI (%)') +
ylab('Carbonate LOI (%)') +
theme(axis.title = element_text())
# Stacked barplot (x = reorder(sample, LOI))
loi_long = gather(loi, key = var, value = value, c550, c950)
ggplot(loi_long, aes(x = reorder(sample, value), y = value, fill = var)) +
geom_bar(stat = 'identity', position = 'stack') +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Total distribution of LOI in 550 C (1) and 950 C (2)
boxplot(loi$c550, loi$c950)
# Color by type
ggplot(tga,
aes(c550, c950, label = rownames(tga), colour = factor(type))) +
geom_point(size=2) +
geom_text_repel(size=3) + labs(title = "Organic vs. carbonate content") +
xlab('Organic LOI (%)') +
ylab('Carbonate LOI (%)') +
theme(axis.title = element_text())
# Stacked barplot (x = reorder(sample, LOI))
tga_long = gather(tga, key = var, value = value, c550, c950)
ggplot(tga_long, aes(x = reorder(Name, value), y = value, fill = var)) +
geom_bar(stat = 'identity', position = 'stack') +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Libraries
library(dplyr);
library(ggplot2);
library(ggrepel);
library(rgl);
library(plot3D);
library(colordistance)
color1 <- read.table("data/color_AYB.txt", header = TRUE, sep = "\t", dec = ",")
rownames(color1) <- color1$Data.Name
color <- subset(color1[, 3:5])
summary(color)
## L a b
## Min. :34.56 Min. : 2.480 Min. : 6.98
## 1st Qu.:38.93 1st Qu.: 5.215 1st Qu.:12.73
## Median :45.30 Median : 5.540 Median :13.28
## Mean :46.69 Mean : 6.211 Mean :14.56
## 3rd Qu.:50.95 3rd Qu.: 6.675 3rd Qu.:16.21
## Max. :67.20 Max. :11.680 Max. :23.75
ggplot(color,
aes(a, b, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "Color (a*b)") +
xlab("a D65") +
ylab("b D65")
ggplot(color1,
aes(X, L, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "Luminance") +
ylab("L* (D65)") +
xlab("Sample number")
ggplot(color,
aes(a, L, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "A versus luminance") +
ylab("L* (D65)") +
xlab("a")
ggplot(color,
aes(b, L, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "B versus luminance") +
ylab("L* (D65)") +
xlab("b")
# Colorful 3D plot with drop lines
scatter3D(color$a, color$b, color$L,
xlab = "a* (D65)", ylab = "b* (D65)", zlab="L* (D65)",
phi = 0, bty = "g", type = "h",
ticktype = "detailed", pch = 19, cex = 1.2)
text3D(color$a, color$b, color$L, labels = rownames(color),
add = TRUE, colkey = FALSE, cex = 0.7)
scatter3D(color$L, color$a, color$b,
xlab = "L* (D65)", ylab = "a* (D65)", zlab="b* (D65)",
phi = 0, bty = "g", type = "h",
ticktype = "detailed", pch = 19, cex = 1.2)
text3D(color$L, color$a, color$b, labels = rownames(color),
add = TRUE, colkey = FALSE, cex = 0.7)
Intro about the site
From literature we can identify three rough classes of elements that could be used to analyze mudbricks from pXRF results:
Good: Rb, Sr, Y, Zr, Nb, Th, Pb, Cu, Zn, Fe
Neutral: V, Cr, Co, Ni, Ca, Ti
Bad: Na, P, Ba (+ Ca, Al)
The “bad” elements were excluded from the pXRF results from the get-go. After scaling and averaging the remaining values we ended up with two analysis datasets: “pxrf_all” that contains all the elements that have non-zero results, and “pxrf_no_na”, in which all the elements that have even one NA result are omitted.
# Libraries
library(openxlsx);
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(cluster);
library(ggplot2);
library(ggrepel);
library(ggfortify);
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
# Read the raw data
pxrf <- read.xlsx("data/pxrf_israel.xlsx", sep=";")
# Remove any rows containing words 'ERROR' or 'TEST'
pxrf <- filter(pxrf, !grepl('TEST', Sample))
pxrf <- filter(pxrf, !grepl('ERROR', Sample))
# Turn any "LOD":s and missing values to "NA":s to allow scaling
# Removing "Na", S", "Cl", "P" and "Ba" already
values <- c("MgO", "Al2O3", "SiO2",
"K2O", "CaO", "Ti", "V", "Cr",
"Mn", "Fe", "Co", "Ni", "Cu", "Zn", "As",
"Se", "Rb", "Sr", "Y", "Zr", "Nb", "Mo",
"Rh", "Pd", "Ag", "Cd", "Sn", "Sb",
"La", "Ce", "Hf", "Ta", "W", "Pt", "Au",
"Hg", "Tl", "Pb", "Bi", "Th", "U")
pxrf_NA <- select(pxrf, one_of(values))
pxrf_NA <- pxrf_NA %>% mutate_if(is.character,as.numeric)
# Scaling the data with standard z-score method
scaled_pxrf <- as.data.frame(scale(pxrf_NA))
# Average data by "Sample", "Area" and "Type" columns from the original dataset
average_pxrf <- aggregate(scaled_pxrf, by = list(pxrf$Sample, pxrf$Area, pxrf$Type), FUN = mean)
# Assign sample names as row names, and rename the other newly created columns back to "Type" and "Area" for clarity
rownames(average_pxrf) <- average_pxrf$Group.1
average_pxrf <- average_pxrf %>% select(-Group.1)
names(average_pxrf)[names(average_pxrf) == "Group.2"] <- "Area"
names(average_pxrf)[names(average_pxrf) == "Group.3"] <- "Type"
# Change "Type" and "Area" to factors
average_pxrf$Type <- factor(average_pxrf$Type)
average_pxrf$Area <- factor(average_pxrf$Area)
# Removing columns that only have NA values
all_na <- function(x) any(!is.na(x))
pxrf_all <- average_pxrf %>% select_if(all_na)
# Removing columns if they contain any NA values at all
any_na <- function(x) all(!is.na(x))
pxrf_no_na <- average_pxrf %>% select_if(any_na)
pxrf_no_na:
summary(pxrf_no_na)
## Area Type MgO Al2O3
## Ad-Halom :9 floor : 2 Min. :-1.2243 Min. :-1.1535
## Rishon Le'Zion:1 floor plaster: 1 1st Qu.:-0.3655 1st Qu.:-0.7296
## Tell Iztabba :9 lime plaster : 4 Median : 0.3253 Median :-0.1502
## mudbrick :12 Mean : 0.2570 Mean :-0.1498
## 3rd Qu.: 0.7669 3rd Qu.: 0.3186
## Max. : 2.2752 Max. : 1.0340
## CaO Ti Mn Fe
## Min. :-1.1324 Min. :-1.62849 Min. :-1.1564 Min. :-1.21031
## 1st Qu.:-0.3895 1st Qu.:-0.59656 1st Qu.:-0.7510 1st Qu.:-0.75649
## Median : 0.2485 Median :-0.02908 Median :-0.1021 Median : 0.09126
## Mean : 0.1365 Mean :-0.17581 Mean :-0.1364 Mean :-0.06331
## 3rd Qu.: 0.6235 3rd Qu.: 0.38876 3rd Qu.: 0.2178 3rd Qu.: 0.47399
## Max. : 2.0327 Max. : 1.15358 Max. : 1.4046 Max. : 1.24389
## Cu Zn Sr Zr
## Min. :-1.23132 Min. :-1.47722 Min. :-1.1311 Min. :-1.9036
## 1st Qu.:-0.40202 1st Qu.:-0.56202 1st Qu.:-0.5905 1st Qu.:-1.2329
## Median : 0.06850 Median :-0.11447 Median :-0.2214 Median :-0.1911
## Mean : 0.03885 Mean : 0.07566 Mean :-0.1056 Mean :-0.3629
## 3rd Qu.: 0.38611 3rd Qu.: 0.91136 3rd Qu.: 0.3090 3rd Qu.: 0.4183
## Max. : 1.19776 Max. : 1.39913 Max. : 1.7253 Max. : 1.4929
pxrf_all:
summary(pxrf_all)
## Area Type MgO Al2O3
## Ad-Halom :9 floor : 2 Min. :-1.2243 Min. :-1.1535
## Rishon Le'Zion:1 floor plaster: 1 1st Qu.:-0.3655 1st Qu.:-0.7296
## Tell Iztabba :9 lime plaster : 4 Median : 0.3253 Median :-0.1502
## mudbrick :12 Mean : 0.2570 Mean :-0.1498
## 3rd Qu.: 0.7669 3rd Qu.: 0.3186
## Max. : 2.2752 Max. : 1.0340
##
## SiO2 K2O CaO Ti
## Min. :-1.63885 Min. :-0.379762 Min. :-1.1324 Min. :-1.62849
## 1st Qu.:-0.60735 1st Qu.: 0.007179 1st Qu.:-0.3895 1st Qu.:-0.59656
## Median :-0.05503 Median : 0.329753 Median : 0.2485 Median :-0.02908
## Mean :-0.11085 Mean : 0.326436 Mean : 0.1365 Mean :-0.17581
## 3rd Qu.: 0.33797 3rd Qu.: 0.477998 3rd Qu.: 0.6235 3rd Qu.: 0.38876
## Max. : 1.39686 Max. : 1.190872 Max. : 2.0327 Max. : 1.15358
## NA's :1 NA's :8
## Mn Fe Ni Cu
## Min. :-1.1564 Min. :-1.21031 Min. :-0.85215 Min. :-1.23132
## 1st Qu.:-0.7510 1st Qu.:-0.75649 1st Qu.:-0.40208 1st Qu.:-0.40202
## Median :-0.1021 Median : 0.09126 Median :-0.18228 Median : 0.06850
## Mean :-0.1364 Mean :-0.06331 Mean : 0.01659 Mean : 0.03885
## 3rd Qu.: 0.2178 3rd Qu.: 0.47399 3rd Qu.: 0.26255 3rd Qu.: 0.38611
## Max. : 1.4046 Max. : 1.24389 Max. : 1.17838 Max. : 1.19776
## NA's :11
## Zn Rb Sr Y
## Min. :-1.47722 Min. :-0.9609 Min. :-1.1311 Min. :-0.85071
## 1st Qu.:-0.56202 1st Qu.:-0.3269 1st Qu.:-0.5905 1st Qu.:-0.47216
## Median :-0.11447 Median : 0.3071 Median :-0.2214 Median :-0.09361
## Mean : 0.07566 Mean : 0.2332 Mean :-0.1056 Mean :-0.30009
## 3rd Qu.: 0.91136 3rd Qu.: 0.6637 3rd Qu.: 0.3090 3rd Qu.:-0.02478
## Max. : 1.39913 Max. : 1.4364 Max. : 1.7253 Max. : 0.04405
## NA's :6 NA's :16
## Zr Nb Ag
## Min. :-1.9036 Min. :-0.1096 Min. :0.07923
## 1st Qu.:-1.2329 1st Qu.: 0.2421 1st Qu.:0.07923
## Median :-0.1911 Median : 0.5937 Median :0.07923
## Mean :-0.3629 Mean : 0.5937 Mean :0.07923
## 3rd Qu.: 0.4183 3rd Qu.: 0.9454 3rd Qu.:0.07923
## Max. : 1.4929 Max. : 1.2971 Max. :0.07923
## NA's :17 NA's :18
# Drop the more dubious elements
pxrf_1 <- pxrf_no_na %>% select(-Al2O3)
pxrf_1 <- pxrf_1 %>% select(-CaO)
# Optimal number of clusters in K-means analysis: 2
k_max <- 8
twcss <- sapply(1:k_max, function(k){kmeans(pxrf_1[3:10], k)$tot.withinss})
qplot(x = 1:k_max, y = twcss, geom = 'line')
# K-means clustering in ggpairs: 2 clusters
km <-kmeans(pxrf_1[3:10], centers = 4)
cluster_x <- as.factor(km$cluster)
ggpairs(pxrf_1, mapping = aes(col = cluster_x))
PCA with only the no-NA:s elements:
# PCA
pca_1 <- prcomp(pxrf_1[3:10])
summary(pca_1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.7341 1.1365 0.8140 0.51966 0.42169 0.2369 0.15638
## Proportion of Variance 0.5463 0.2346 0.1204 0.04906 0.03231 0.0102 0.00444
## Cumulative Proportion 0.5463 0.7809 0.9013 0.95038 0.98268 0.9929 0.99732
## PC8
## Standard deviation 0.12142
## Proportion of Variance 0.00268
## Cumulative Proportion 1.00000
# PCA1 plots
biplot(pca_1, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"), main = "PCA limited elements")
autoplot(pca_1, data=pxrf_1, colour='Area', shape = FALSE, label = TRUE, main = "PCA grouped by area")
autoplot(pca_1, data=pxrf_1, colour='Type', shape = FALSE, label = TRUE, main = "PCA grouped by type")
PCA with all the feasible elements:
# PCA with all elements, NA:s converted to zeros
pxrf_all[is.na(pxrf_all)] <- 0
pca_2 <- prcomp(pxrf_all[3:19])
summary(pca_2)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.0585 1.2500 0.8709 0.73570 0.63683 0.41024 0.36958
## Proportion of Variance 0.5252 0.1937 0.0940 0.06709 0.05027 0.02086 0.01693
## Cumulative Proportion 0.5252 0.7189 0.8129 0.88000 0.93027 0.95113 0.96806
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.26815 0.23622 0.2126 0.18137 0.17097 0.09826 0.07320
## Proportion of Variance 0.00891 0.00692 0.0056 0.00408 0.00362 0.00120 0.00066
## Cumulative Proportion 0.97698 0.98389 0.9895 0.99357 0.99720 0.99839 0.99906
## PC15 PC16 PC17
## Standard deviation 0.07015 0.05176 0.0002973
## Proportion of Variance 0.00061 0.00033 0.0000000
## Cumulative Proportion 0.99967 1.00000 1.0000000
# PCA2 plots
biplot(pca_2, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"), main = "PCA more elements")
autoplot(pca_2, data=pxrf_all, colour='Area', shape = FALSE, label = TRUE, main = "PCA more elements grouped by area")
autoplot(pca_2, data=pxrf_all, colour='Type', shape = FALSE, label = TRUE, main = "PCA more elements grouped by type")
# HCA dendrogam 1
dend <-
pxrf_1[3:10] %>% # data
dist %>% # calculate a distance matrix
hclust(method = "ward.D2") %>% # hierarchical clustering
as.dendrogram %>% # turn the object into a dendrogram
set("branches_k_color", k=4) %>%
set("branches_lwd", 1.2) %>%
set("labels_col", c("red", "black", "blue"), k=6) %>%
set("labels_cex", 1) %>%
set("leaves_pch", NA)
# plot the dend in usual "base" plotting engine:
plot(dend)
# HCA dendrogam 2:
HCA.ward4 <- hclust(dist(pxrf_1[3:10], method="euclid"), method="ward.D2")
plot(HCA.ward4, main="HCA with clusters")
rect.hclust(HCA.ward4, k=6)
# Libraries
library(dplyr);
library(openxlsx);
library(ggtern)
# Read and filter data
grain <- read.xlsx("data/grain_israel.xlsx", sep=";")
# Remove automatically created averages in order to include results from multiple runs for the same sample
grain <- grain %>%
filter(!grepl('Average', Sample)) %>%
filter(!grepl('test', Sample))
# Average data by "Sample", "Context" and "Type" columns from the original dataset
grain <- aggregate(grain, by=list(grain$Sample, grain$Context, grain$Type), FUN=mean)
# Remove the now empty columns for clarity
grain <- grain %>% select(-Sample)
grain <- grain %>% select(-Context)
grain <- grain %>% select(-Type)
# Assign sample names ("Group.1") as row names, and rename the other created columns back to "Type" and "Area" for clarity
rownames(grain) <- grain$Group.1
grain <- grain %>% select(-Group.1)
names(grain)[names(grain) == "Group.2"] <- "Context"
names(grain)[names(grain) == "Group.3"] <- "Type"
# Scaling clay-silt-sand portions to values between 0 and 1
# (Otherwise the differences in low clay percentages are too small to differentiate)
normalize <- function(x, na.rm=TRUE){(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))}
# Apply function "normalize" to clay, silt and sand columns
grain_01 <- as.data.frame(apply(grain[4:6], 2, normalize))
# Ternary plots
ggtern(data=grain_01, aes(x=Clay, y=Sand, z=Silt, color=grain$Context)) +
labs(title="Clay silt sand") +
theme_rgbw() +
theme_nomask() +
geom_point(size=0) +
geom_text(aes(label=rownames(grain_01)), size=3)
ggtern(data=grain_01, aes(x=Clay, y=Sand, z=Silt, color=grain$Context)) +
labs(title="Clay silt sand") +
theme_nomask() +
geom_mask() +
geom_point(size=2)
We did loss on ignition analysis both in the traditional manual method and with a modern TGA for comparison.
# Libraries
library(openxlsx);
library(dplyr);
library(ggplot2);
library(ggrepel);
library(tidyr);
loi <- read.xlsx("data/loi_israel.xlsx", sep=";")
tga <- read.xlsx("data/tga_israel.xlsx", sep=";")
summary(loi)
## sample crucible wet_weight sample_wet
## Length:19 Min. :7.921 Min. :10.55 Min. :2.081
## Class :character 1st Qu.:8.557 1st Qu.:11.23 1st Qu.:2.403
## Mode :character Median :9.278 Median :11.78 Median :2.584
## Mean :9.017 Mean :11.63 Mean :2.611
## 3rd Qu.:9.452 3rd Qu.:11.92 3rd Qu.:2.828
## Max. :9.787 Max. :12.62 Max. :3.231
## dry_weight after_550_C after_950_C context
## Min. :10.52 Min. :10.42 Min. : 9.773 Length:19
## 1st Qu.:11.23 1st Qu.:11.18 1st Qu.:10.376 Class :character
## Median :11.76 Median :11.70 Median :10.971 Mode :character
## Mean :11.60 Mean :11.53 Mean :10.884
## 3rd Qu.:11.88 3rd Qu.:11.78 3rd Qu.:11.322
## Max. :12.56 Max. :12.49 Max. :12.371
## type
## Length:19
## Class :character
## Mode :character
##
##
##
summary(tga)
## Name Analysis;Date Batch Wet
## Length:12 Length:12 Length:12 Length:12
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Dry Mass_550 Mass_950 context
## Length:12 Length:12 Length:12 Length:12
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## type
## Length:12
## Class :character
## Mode :character
# Remove rows containing words 'kekkila' (internal standard)
tga <- filter(tga, !grepl('kekkila', Name))
# TGA character columns to numeric
tga[, c(4:7)] <- sapply(tga[, c(4:7)], as.numeric)
# Sample names as rownames
rownames(loi) <- loi$sample
rownames(tga) <- tga$Name
# Creating new columns for analysis by subtracting crucible mass;
# "dry" (sample mass after drying)
# "c550" (sample mass after 550 C)
# "c950" (sample mass after 950 C)
loi$dry <- (loi$dry_weight - loi$crucible)
loi$c550mass <- (loi$after_550_C - loi$crucible)
loi$c950mass <- (loi$after_950_C - loi$crucible)
loi$c550 <- (loi$dry - loi$c550mass)/(loi$dry) * 100
loi$c950 <- (loi$c550mass - loi$c950mass)/(loi$c550mass) * 100
# Same with TGA
tga$c550 <- (tga$Dry - tga$Mass_550)/(tga$Dry) * 100
tga$c950 <- (tga$Mass_550 - tga$Mass_950)/(tga$Mass_550) * 100
# Distribution of LOI in 550 C (1) and 950 C (2)
boxplot(c550~context, data=loi)
boxplot(c950~context, data=loi)
# Color by context
ggplot(loi,
aes(c550, c950, label = rownames(loi), colour = factor(context))) +
geom_point(size=2, aes(shape = factor(type))) +
geom_text_repel(size=3) + labs(title = "Organic vs. carbonate content") +
xlab('Organic LOI (%)') +
ylab('Carbonate LOI (%)') +
theme(axis.title = element_text())
# Color by type
ggplot(loi,
aes(c550, c950)) +
geom_point(size=2, aes(shape = factor(context), colour = factor(type))) +
labs(title = "Organic vs. carbonate content") +
xlab('Organic LOI (%)') +
ylab('Carbonate LOI (%)') +
theme(axis.title = element_text())
# Stacked barplot (x = reorder(sample, LOI))
loi_long = gather(loi, key = var, value = value, c550, c950)
ggplot(loi_long, aes(x = reorder(sample, value), y = value, fill = var)) +
geom_bar(stat = 'identity', position = 'stack') +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Distribution of LOI in 550 C (1) and 950 C (2)
boxplot(c550~context, data=tga)
boxplot(c950~context, data=tga)
# Color by context
ggplot(tga,
aes(c550, c950, label = rownames(tga), colour = factor(context))) +
geom_point(size=2) +
geom_text_repel(size=3) + labs(title = "Organic vs. carbonate content") +
xlab('Organic LOI (%)') +
ylab('Carbonate LOI (%)') +
theme(axis.title = element_text())
# Stacked barplot (x = reorder(sample, LOI))
tga_long = gather(tga, key = var, value = value, c550, c950)
ggplot(tga_long, aes(x = reorder(Name, value), y = value, fill = var)) +
geom_bar(stat = 'identity', position = 'stack') +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Libraries
library(dplyr);
library(ggplot2);
library(ggrepel);
library(rgl);
library(plot3D);
library(colordistance)
color1 <- read.table("data/color_israel.txt", header = TRUE, sep = "\t", dec = ",")
rownames(color1) <- color1$Data.Name
color <- subset(color1[, 3:5])
summary(color)
## L a b
## Min. :43.75 Min. : 3.160 Min. :13.85
## 1st Qu.:56.08 1st Qu.: 3.770 1st Qu.:14.71
## Median :64.30 Median : 4.350 Median :15.85
## Mean :63.19 Mean : 6.649 Mean :17.27
## 3rd Qu.:71.51 3rd Qu.: 6.280 3rd Qu.:17.70
## Max. :75.34 Max. :19.530 Max. :25.84
ggplot(color,
aes(a, b, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "Color (a*b)") +
xlab("a D65") +
ylab("b D65")
ggplot(color1,
aes(X, L, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "Luminance") +
ylab("L* (D65)") +
xlab("Sample number")
ggplot(color,
aes(a, L, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "A versus luminance") +
ylab("L* (D65)") +
xlab("a")
ggplot(color,
aes(b, L, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "B versus luminance") +
ylab("L* (D65)") +
xlab("b")
# Colorful 3D plot with drop lines
scatter3D(color$a, color$b, color$L,
xlab = "a* (D65)", ylab = "b* (D65)", zlab="L* (D65)",
phi = 0, bty = "g", type = "h",
ticktype = "detailed", pch = 19, cex = 1.2)
text3D(color$a, color$b, color$L, labels = rownames(color),
add = TRUE, colkey = FALSE, cex = 0.7)
scatter3D(color$L, color$a, color$b,
xlab = "L* (D65)", ylab = "a* (D65)", zlab="b* (D65)",
phi = 0, bty = "g", type = "h",
ticktype = "detailed", pch = 19, cex = 1.2)
text3D(color$L, color$a, color$b, labels = rownames(color),
add = TRUE, colkey = FALSE, cex = 0.7)
Intro about the site
From literature we can identify three rough classes of elements that could be used to analyze mudbricks from pXRF results:
Good: Rb, Sr, Y, Zr, Nb, Th, Pb, Cu, Zn, Fe
Neutral: V, Cr, Co, Ni, Ca, Ti
Bad: Na, P, Ba (+ Ca, Al)
The “bad” elements were excluded from the pXRF results from the get-go. After scaling and averaging the remaining values we ended up with two analysis datasets: “pxrf_all” that contains all the elements that have non-zero results, and “pxrf_no_na”, in which all the elements that have even one NA result are omitted.
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggfortify);
library(pca3d);
library(FactoMineR); # fast PCA graphs
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(tidyverse) # rectangles around HClust
# Read data
pxrf <- read.xlsx("data/pxrf_PP.xlsx", sep=";")
# Remove any rows containing words 'ERROR' or 'TEST'
pxrf <- filter(pxrf, !grepl('TEST', Sample))
pxrf <- filter(pxrf, !grepl('ERROR', Sample))
# Turn any "LOD":s and missing values to "NA":s to allow scaling
values <- c("MgO", "Al2O3", "SiO2",
"S", "Cl", "K2O", "CaO", "Ti", "V", "Cr",
"Mn", "Fe", "Co", "Ni", "Cu", "Zn", "As",
"Se", "Rb", "Sr", "Y", "Zr", "Nb", "Mo",
"Rh", "Pd", "Ag", "Cd", "Sn", "Sb",
"La", "Ce", "Hf", "Ta", "W", "Pt", "Au",
"Hg", "Tl", "Pb", "Bi", "Th", "U")
pxrf_NA <- select(pxrf, one_of(values))
pxrf_NA <- pxrf_NA %>% mutate_if(is.character,as.numeric)
# Average data by "Sample", "Area" and "Type" columns from the original dataset
average_pxrf <- aggregate(pxrf_NA, by = list(pxrf$Sample, pxrf$Area, pxrf$Type), FUN = mean)
# Assign sample names as row names, and rename the other newly created columns back to "Type" and "Area" for clarity
rownames(average_pxrf) <- average_pxrf$Group.1
average_pxrf <- average_pxrf %>% select(-Group.1)
names(average_pxrf)[names(average_pxrf) == "Group.2"] <- "Area"
names(average_pxrf)[names(average_pxrf) == "Group.3"] <- "Type"
# Change "Type" and "Area" to factors
average_pxrf$Type <- factor(average_pxrf$Type)
average_pxrf$Area <- factor(average_pxrf$Area)
# Removing columns that only have NA values
all_na <- function(x) any(!is.na(x))
pxrf_all <- average_pxrf %>% select_if(all_na)
# Removing columns if they contain any NA values at all
any_na <- function(x) all(!is.na(x))
pxrf_no_na <- average_pxrf %>% select_if(any_na)
# Datasets with only our new mudbrick samples (no soil or previously published ones)
MB_nona <- pxrf_no_na[c(1:11), ]
MB_all <- pxrf_all[c(1:11), ]
# Removing As, Nb, Rh, Ag and Pb as they have almost no viable measurement values
MB_all <- MB_all %>%
select(-As) %>%
select(-Nb) %>%
select(-Rh) %>%
select(-Ag) %>%
select(-Pb)
pxrf_no_na:
summary(pxrf_no_na)
## Area Type Ti Mn
## old :34 mudbrick :11 Min. :0.0969 Min. :0.01190
## LA54:4 : 5 mudbrick (old):34 1st Qu.:0.1814 1st Qu.:0.03893
## LA59:2 : 5 soil : 8 Median :0.2118 Median :0.04670
## Laona soil l1: 2 Mean :0.2180 Mean :0.04549
## Laona soil l2: 2 3rd Qu.:0.2594 3rd Qu.:0.05440
## Laona soil l3: 2 Max. :0.3571 Max. :0.06800
## (Other) : 3
## Fe Ni Cu Zr
## Min. :0.9617 Min. :0.000400 Min. :0.003600 Min. :0.004300
## 1st Qu.:2.2241 1st Qu.:0.002200 1st Qu.:0.005933 1st Qu.:0.007800
## Median :2.7214 Median :0.003000 Median :0.006500 Median :0.008800
## Mean :2.8091 Mean :0.002856 Mean :0.006761 Mean :0.008810
## 3rd Qu.:3.3530 3rd Qu.:0.003500 3rd Qu.:0.007400 3rd Qu.:0.009667
## Max. :4.7754 Max. :0.005900 Max. :0.011333 Max. :0.012200
##
pxrf_all:
summary(pxrf_all)
## Area Type MgO Al2O3
## old :34 mudbrick :11 Min. :2.919 Min. :2.182
## LA54:4 : 5 mudbrick (old):34 1st Qu.:3.369 1st Qu.:3.403
## LA59:2 : 5 soil : 8 Median :4.088 Median :4.317
## Laona soil l1: 2 Mean :4.011 Mean :4.168
## Laona soil l2: 2 3rd Qu.:4.499 3rd Qu.:4.702
## Laona soil l3: 2 Max. :5.125 Max. :5.902
## (Other) : 3 NA's :34 NA's :34
## SiO2 S Cl K2O
## Min. : 9.164 Min. :0.04373 Min. :0.01113 Min. :0.1980
## 1st Qu.:14.598 1st Qu.:0.10686 1st Qu.:0.02597 1st Qu.:0.3599
## Median :17.087 Median :0.21532 Median :0.05117 Median :0.4238
## Mean :16.502 Mean :0.25538 Mean :0.13135 Mean :0.4340
## 3rd Qu.:18.560 3rd Qu.:0.32239 3rd Qu.:0.10892 3rd Qu.:0.4795
## Max. :22.259 Max. :0.74597 Max. :0.93440 Max. :0.6584
## NA's :34 NA's :35 NA's :34 NA's :36
## CaO Ti V Mn
## Min. :11.28 Min. :0.0969 Min. :0.00267 Min. :0.01190
## 1st Qu.:14.53 1st Qu.:0.1814 1st Qu.:0.00334 1st Qu.:0.03893
## Median :18.05 Median :0.2118 Median :0.00420 Median :0.04670
## Mean :17.94 Mean :0.2180 Mean :0.00425 Mean :0.04549
## 3rd Qu.:20.82 3rd Qu.:0.2594 3rd Qu.:0.00512 3rd Qu.:0.05440
## Max. :25.59 Max. :0.3571 Max. :0.00587 Max. :0.06800
## NA's :34 NA's :41
## Fe Ni Cu Zn
## Min. :0.9617 Min. :0.000400 Min. :0.003600 Min. :0.00167
## 1st Qu.:2.2241 1st Qu.:0.002200 1st Qu.:0.005933 1st Qu.:0.00308
## Median :2.7214 Median :0.003000 Median :0.006500 Median :0.00337
## Mean :2.8091 Mean :0.002856 Mean :0.006761 Mean :0.00335
## 3rd Qu.:3.3530 3rd Qu.:0.003500 3rd Qu.:0.007400 3rd Qu.:0.00382
## Max. :4.7754 Max. :0.005900 Max. :0.011333 Max. :0.00500
## NA's :34
## As Rb Sr Y
## Min. :0.00033 Min. :0.00120 Min. :0.01790 Min. :0.00087
## 1st Qu.:0.00033 1st Qu.:0.00173 1st Qu.:0.02893 1st Qu.:0.00137
## Median :0.00040 Median :0.00203 Median :0.03403 Median :0.00160
## Mean :0.00040 Mean :0.00204 Mean :0.03402 Mean :0.00161
## 3rd Qu.:0.00047 3rd Qu.:0.00213 3rd Qu.:0.03835 3rd Qu.:0.00173
## Max. :0.00047 Max. :0.00357 Max. :0.04840 Max. :0.00260
## NA's :48 NA's :35 NA's :34 NA's :36
## Zr Nb Rh Ag
## Min. :0.004300 Min. :0.00063 Min. :0.01570 Min. :0.00230
## 1st Qu.:0.007800 1st Qu.:0.00063 1st Qu.:0.01683 1st Qu.:0.00282
## Median :0.008800 Median :0.00063 Median :0.01797 Median :0.00322
## Mean :0.008810 Mean :0.00063 Mean :0.01797 Mean :0.00330
## 3rd Qu.:0.009667 3rd Qu.:0.00063 3rd Qu.:0.01910 3rd Qu.:0.00369
## Max. :0.012200 Max. :0.00063 Max. :0.02023 Max. :0.00447
## NA's :52 NA's :51 NA's :49
## Pb
## Min. :0.002
## 1st Qu.:0.002
## Median :0.002
## Mean :0.002
## 3rd Qu.:0.002
## Max. :0.002
## NA's :52
MB_nona:
summary(MB_nona)
## Area Type Ti Mn
## LA54:4 :5 mudbrick :11 Min. :0.0969 Min. :0.01190
## LA59:2 :5 mudbrick (old): 0 1st Qu.:0.1663 1st Qu.:0.02397
## LA54:7 :1 soil : 0 Median :0.1915 Median :0.03347
## Hadjuabudllah l1:0 Mean :0.1780 Mean :0.03070
## Hadjuabudllah l2:0 3rd Qu.:0.1941 3rd Qu.:0.03895
## Laona soil l1 :0 Max. :0.2295 Max. :0.04177
## (Other) :0
## Fe Ni Cu Zr
## Min. :0.9617 Min. :0.001400 Min. :0.004500 Min. :0.005400
## 1st Qu.:1.6270 1st Qu.:0.002317 1st Qu.:0.005533 1st Qu.:0.007767
## Median :2.1312 Median :0.003000 Median :0.006700 Median :0.008733
## Mean :1.8912 Mean :0.002812 Mean :0.007333 Mean :0.008494
## 3rd Qu.:2.2622 3rd Qu.:0.003367 3rd Qu.:0.008717 3rd Qu.:0.009100
## Max. :2.4481 Max. :0.003967 Max. :0.011333 Max. :0.011833
##
MB_all:
summary(MB_all)
## Area Type MgO Al2O3
## LA54:4 :5 mudbrick :11 Min. :2.919 Min. :2.182
## LA59:2 :5 mudbrick (old): 0 1st Qu.:3.531 1st Qu.:3.403
## LA54:7 :1 soil : 0 Median :4.088 Median :4.317
## Hadjuabudllah l1:0 Mean :3.999 Mean :3.941
## Hadjuabudllah l2:0 3rd Qu.:4.400 3rd Qu.:4.593
## Laona soil l1 :0 Max. :5.125 Max. :5.270
## (Other) :0
## SiO2 S Cl K2O
## Min. : 9.164 Min. :0.1622 Min. :0.02723 Min. :0.1980
## 1st Qu.:13.366 1st Qu.:0.2737 1st Qu.:0.06783 1st Qu.:0.3920
## Median :17.705 Median :0.3221 Median :0.09397 Median :0.4376
## Mean :15.874 Mean :0.3550 Mean :0.20759 Mean :0.4312
## 3rd Qu.:18.560 3rd Qu.:0.3809 3rd Qu.:0.23782 3rd Qu.:0.4757
## Max. :20.334 Max. :0.7460 Max. :0.93440 Max. :0.6052
## NA's :1
## CaO Ti V Mn
## Min. :11.28 Min. :0.0969 Min. :0.002667 Min. :0.01190
## 1st Qu.:18.46 1st Qu.:0.1663 1st Qu.:0.004317 1st Qu.:0.02397
## Median :20.74 Median :0.1915 Median :0.004750 Median :0.03347
## Mean :19.67 Mean :0.1780 Mean :0.004721 Mean :0.03070
## 3rd Qu.:22.12 3rd Qu.:0.1941 3rd Qu.:0.005633 3rd Qu.:0.03895
## Max. :25.59 Max. :0.2295 Max. :0.005867 Max. :0.04177
## NA's :3
## Fe Ni Cu Zn
## Min. :0.9617 Min. :0.001400 Min. :0.004500 Min. :0.001667
## 1st Qu.:1.6270 1st Qu.:0.002317 1st Qu.:0.005533 1st Qu.:0.002467
## Median :2.1312 Median :0.003000 Median :0.006700 Median :0.003233
## Mean :1.8912 Mean :0.002812 Mean :0.007333 Mean :0.003170
## 3rd Qu.:2.2622 3rd Qu.:0.003367 3rd Qu.:0.008717 3rd Qu.:0.003800
## Max. :2.4481 Max. :0.003967 Max. :0.011333 Max. :0.005000
##
## Rb Sr Y Zr
## Min. :0.001200 Min. :0.01790 Min. :0.0008667 Min. :0.005400
## 1st Qu.:0.001617 1st Qu.:0.02893 1st Qu.:0.0013333 1st Qu.:0.007767
## Median :0.001767 Median :0.03507 Median :0.0014000 Median :0.008733
## Mean :0.001750 Mean :0.03438 Mean :0.0013741 Mean :0.008494
## 3rd Qu.:0.001925 3rd Qu.:0.03835 3rd Qu.:0.0014667 3rd Qu.:0.009100
## Max. :0.002133 Max. :0.04840 Max. :0.0017333 Max. :0.011833
## NA's :1 NA's :2
Only the “new” mudbrick samples
# Optimal number of clusters in K-means analysis: 2
k_max <- 8
twcss <- sapply(1:k_max, function(k){kmeans(MB_nona[3:8], k)$tot.withinss})
qplot(x = 1:k_max, y = twcss, geom = 'line')
# K-means clustering in ggpairs: 2 clusters
km <-kmeans(MB_nona[3:8], centers = 2)
cluster_x <- as.factor(km$cluster)
ggpairs(MB_nona[3:8], mapping = aes(col = cluster_x))
PCA with only the no-NA:s elements:
# PCA for only non-NA elements
colnames(MB_nona)
## [1] "Area" "Type" "Ti" "Mn" "Fe" "Ni" "Cu" "Zr"
pca_1 <- prcomp(MB_nona[3:8], scale = TRUE, center = TRUE)
summary(pca_1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.0136 1.0789 0.70395 0.45806 0.25323 0.10779
## Proportion of Variance 0.6758 0.1940 0.08259 0.03497 0.01069 0.00194
## Cumulative Proportion 0.6758 0.8698 0.95241 0.98738 0.99806 1.00000
biplot(pca_1, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"))
autoplot(pca_1, data=MB_nona, colour='Area', shape = FALSE, label = TRUE, main = "PCA Palaepaphos 1: no NA:s at all")
PCA(MB_nona[3:8])
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 11 individuals, described by 6 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
PCA with all the feasible elements:
# NA:s converted to zeros
MB_all[is.na(MB_all)] <- 0
colnames(MB_all)
## [1] "Area" "Type" "MgO" "Al2O3" "SiO2" "S" "Cl" "K2O" "CaO"
## [10] "Ti" "V" "Mn" "Fe" "Ni" "Cu" "Zn" "Rb" "Sr"
## [19] "Y" "Zr"
pca_2 <- prcomp((MB_all[3:20]), scale = TRUE, center = TRUE)
summary(pca_2)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.1620 1.8251 1.26077 1.10902 0.8950 0.63845 0.5612
## Proportion of Variance 0.5554 0.1851 0.08831 0.06833 0.0445 0.02265 0.0175
## Cumulative Proportion 0.5554 0.7405 0.82882 0.89715 0.9416 0.96429 0.9818
## PC8 PC9 PC10 PC11
## Standard deviation 0.4388 0.34174 0.13583 3.155e-16
## Proportion of Variance 0.0107 0.00649 0.00103 0.000e+00
## Cumulative Proportion 0.9925 0.99897 1.00000 1.000e+00
biplot(pca_2, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"))
autoplot(pca_2, data=MB_all, colour='Area', shape = FALSE, label = TRUE, main = "PCA Palaepaphos 2: NA:s permitted")
PCA(MB_all[3:20])
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 11 individuals, described by 18 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
Including the previously published ones, No NA values permitted
# PCA for only non-NA elements
colnames(pxrf_no_na)
## [1] "Area" "Type" "Ti" "Mn" "Fe" "Ni" "Cu" "Zr"
pca_3 <- prcomp(na.omit(pxrf_no_na[3:6]), scale = TRUE, center = TRUE)
summary(pca_3)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.6514 0.9995 0.41614 0.31724
## Proportion of Variance 0.6818 0.2498 0.04329 0.02516
## Cumulative Proportion 0.6818 0.9315 0.97484 1.00000
biplot(pca_3, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"))
autoplot(pca_3, data=pxrf_no_na, colour='Area', shape = FALSE, label = TRUE, main = "PCA Palaepaphos grouped by area")
autoplot(pca_3, data=pxrf_no_na, colour='Type', shape = FALSE, label = TRUE, main = "PCA Palaepaphos grouped by sample type")
Including the previously published ones, NA values are permitted This is not viable, as the difference in what values are available is too different between old and new samples.
# NA:s converted to zeros
pxrf_all[is.na(pxrf_all)] <- 0
colnames(pxrf_all)
## [1] "Area" "Type" "MgO" "Al2O3" "SiO2" "S" "Cl" "K2O" "CaO"
## [10] "Ti" "V" "Mn" "Fe" "Ni" "Cu" "Zn" "As" "Rb"
## [19] "Sr" "Y" "Zr" "Nb" "Rh" "Ag" "Pb"
pca_4 <- prcomp(pxrf_all[3:25], scale = TRUE, center = TRUE)
summary(pca_4)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.3748 1.8765 1.50629 1.13013 1.04455 0.89816 0.83735
## Proportion of Variance 0.4952 0.1531 0.09865 0.05553 0.04744 0.03507 0.03049
## Cumulative Proportion 0.4952 0.6483 0.74692 0.80245 0.84989 0.88497 0.91545
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.77794 0.66209 0.53427 0.40495 0.35837 0.33354 0.27679
## Proportion of Variance 0.02631 0.01906 0.01241 0.00713 0.00558 0.00484 0.00333
## Cumulative Proportion 0.94176 0.96082 0.97323 0.98036 0.98595 0.99078 0.99412
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.22788 0.17584 0.14605 0.12032 0.10296 0.06388 0.04325
## Proportion of Variance 0.00226 0.00134 0.00093 0.00063 0.00046 0.00018 0.00008
## Cumulative Proportion 0.99637 0.99772 0.99865 0.99927 0.99974 0.99991 0.99999
## PC22 PC23
## Standard deviation 0.01157 1.468e-16
## Proportion of Variance 0.00001 0.000e+00
## Cumulative Proportion 1.00000 1.000e+00
biplot(pca_4, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"))
autoplot(pca_4, data=pxrf_all, colour='Area', shape = FALSE, label = TRUE, main = "PCA Palaepaphos grouped by area")
autoplot(pca_4, data=pxrf_all, colour='Type', shape = FALSE, label = TRUE, main = "PCA Palaepaphos grouped by sample type")
HCA including only new mudbrick samples:
# HCA dendrogam 1
dend <-
MB_all[3:20] %>% # data
dist %>% # calculate a distance matrix
hclust(method = "ward.D2") %>% # hierarchical clustering
as.dendrogram %>% # turn the object into a dendrogram
set("branches_k_color", k=5) %>%
set("branches_lwd", 1.2) %>%
set("labels_col", c("red", "black", "blue"), k=5) %>%
set("labels_cex", 1) %>%
set("leaves_pch", NA)
# plot the dend in usual "base" plotting engine:
plot(dend)
# HCA dendrogam 2
HCA.ward5 <- hclust(dist(MB_all[3:20], method="euclid"), method="ward.D2")
plot(HCA.ward5, main="Ward's method")
rect.hclust(HCA.ward5, k=5)
# Divisive hierarchical clustering (DIANA) just for comparison
hc4 <- diana(MB_all[3:20])
pltree(hc4, cex = 1, hang = -1, main = "Dendrogram of diana")
# Divise coefficient; amount of clustering structure found
hc4$dc
## [1] 0.7894269
HCA including all mudbrick samples, only no NA:s elements :
# HCA dendrogam 1
dend <-
pxrf_no_na[1:45, 3:8] %>% # data
dist %>% # calculate a distance matrix
hclust(method = "ward.D2") %>% # hierarchical clustering
as.dendrogram %>% # turn the object into a dendrogram
set("branches_k_color", k=5) %>%
set("branches_lwd", 1.2) %>%
set("labels_col", c("red", "black", "blue"), k=5) %>%
set("labels_cex", 1) %>%
set("leaves_pch", NA)
# plot the dend in usual "base" plotting engine:
plot(dend)
# HCA dendrogam 2
HCA.ward5 <- hclust(dist(pxrf_no_na[1:45, 3:8], method="euclid"), method="ward.D2")
plot(HCA.ward5, main="Ward's method")
rect.hclust(HCA.ward5, k=5)
# Divisive hierarchical clustering (DIANA) just for comparison
hc4 <- diana(pxrf_no_na[3:8])
pltree(hc4, cex = 1, hang = -1, main = "Dendrogram of diana")
# Divise coefficient; amount of clustering structure found
hc4$dc
## [1] 0.9804178
# Libraries
library(dplyr);
library(openxlsx);
library(ggtern)
# Read and filter data
grain <- read.xlsx("data/grain_PP.xlsx", sep=";")
# Remove automatically created averages in order to include results from multiple runs for the same sample
grain <- grain %>%
filter(!grepl('Average', Sample)) %>%
filter(!grepl('test', Sample))
# Average data by "Sample", "Context" and "Type" columns from the original dataset
grain <- aggregate(grain, by=list(grain$Sample, grain$Context, grain$Type), FUN=mean)
# Remove the now empty columns for clarity
grain <- grain %>% select(-Sample)
grain <- grain %>% select(-Context)
grain <- grain %>% select(-Type)
# Assign sample names ("Group.1") as row names, and rename the other created columns back to "Type" and "Area" for clarity
rownames(grain) <- grain$Group.1
grain <- grain %>% select(-Group.1)
names(grain)[names(grain) == "Group.2"] <- "Context"
names(grain)[names(grain) == "Group.3"] <- "Type"
# Scaling clay-silt-sand portions to values between 0 and 1
# (Otherwise the differences in low clay percentages are too small to differentiate)
normalize <- function(x, na.rm=TRUE){(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))}
# Apply function "normalize" to clay, silt and sand columns
grain_01 <- as.data.frame(apply(grain[4:6], 2, normalize))
# Creating subsets for only mudbricks
MB_grain <- grain_01[c(1:11),]
MB_context <- grain[c(1:11),]
# Ternary plots
ggtern(data=MB_grain, aes(x=Clay, y=Sand, z=Silt, color=MB_context$Context)) +
labs(title="Clay silt sand") +
theme_rgbw() +
theme_nomask() +
geom_point(size=0) +
geom_text(aes(label=rownames(MB_grain)), size=3)
ggtern(data=grain_01, aes(x=Clay, y=Sand, z=Silt, color=grain$Context)) +
labs(title="Clay silt sand") +
theme_nomask() +
geom_mask() +
geom_point(size=2)
We did loss on ignition analysis both in the traditional manual method and with a modern TGA for comparison.
# Libraries
library(openxlsx);
library(dplyr);
library(ggplot2);
library(ggrepel);
library(tidyr);
loi <- read.xlsx("data/loi_PP.xlsx", sep=";")
tga <- read.xlsx("data/tga_PP.xlsx", sep=";")
summary(loi)
## sample crucible wet;weight sample;wet
## Length:27 Min. :8.100 Min. : 9.637 Min. :1.537
## Class :character 1st Qu.:8.639 1st Qu.:10.859 1st Qu.:2.193
## Mode :character Median :8.997 Median :11.184 Median :2.371
## Mean :8.988 Mean :11.274 Mean :2.286
## 3rd Qu.:9.454 3rd Qu.:11.844 3rd Qu.:2.453
## Max. :9.560 Max. :12.301 Max. :2.785
## dry_weight after_550_C after_950_C context
## Min. : 9.595 Min. : 9.484 Min. : 9.23 Length:27
## 1st Qu.:10.813 1st Qu.:10.721 1st Qu.:10.27 Class :character
## Median :11.132 Median :11.019 Median :10.59 Mode :character
## Mean :11.216 Mean :11.113 Mean :10.67
## 3rd Qu.:11.777 3rd Qu.:11.681 3rd Qu.:11.18
## Max. :12.218 Max. :12.076 Max. :11.56
## type
## Length:27
## Class :character
## Mode :character
##
##
##
summary(tga)
## Name Analysis;Date Batch Wet
## Length:30 Length:30 Length:30 Length:30
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Dry Mass_550 Mass_950 context
## Length:30 Length:30 Length:30 Length:30
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## type
## Length:30
## Class :character
## Mode :character
# Remove rows containing words 'kekkila' (internal standard)
tga <- filter(tga, !grepl('kekkila', Name))
# TGA character columns to numeric
tga[, c(4:7)] <- sapply(tga[, c(4:7)], as.numeric)
# Sample names as rownames
rownames(loi) <- loi$sample
rownames(tga) <- tga$Name
# Creating new columns for analysis by subtracting crucible mass;
# "dry" (sample mass after drying)
# "c550" (sample mass after 550 C)
# "c950" (sample mass after 950 C)
loi$dry <- (loi$dry_weight - loi$crucible)
loi$c550mass <- (loi$after_550_C - loi$crucible)
loi$c950mass <- (loi$after_950_C - loi$crucible)
loi$c550 <- (loi$dry - loi$c550mass)/(loi$dry) * 100
loi$c950 <- (loi$c550mass - loi$c950mass)/(loi$c550mass) * 100
# Same with TGA
tga$c550 <- (tga$Dry - tga$Mass_550)/(tga$Dry) * 100
tga$c950 <- (tga$Mass_550 - tga$Mass_950)/(tga$Mass_550) * 100
# Distribution of LOI in 550 C (1) and 950 C (2)
boxplot(c550~context, data=loi)
boxplot(c950~context, data=loi)
# Removing duplicates from the next graph and creating MB only subset
loi <- subset(loi[1:19, ])
loi_MB <- subset(loi[9:19, ])
rownames(loi)
## [1] "PP-1" "PP-2" "PP-3" "PP-4" "PP-5" "PP-6" "PP-7" "PP-8" "PP-9"
## [10] "PP-10" "PP-11" "PP-12" "PP-13" "PP-14" "PP-15" "PP-16" "PP-17" "PP-18"
## [19] "PP-19"
rownames(loi_MB)
## [1] "PP-9" "PP-10" "PP-11" "PP-12" "PP-13" "PP-14" "PP-15" "PP-16" "PP-17"
## [10] "PP-18" "PP-19"
# Distribution of LOI in 550 C (1) and 950 C (2)
boxplot(c550~context, data=loi_MB)
boxplot(c950~context, data=loi_MB)
# Color by context
ggplot(loi,
aes(c550, c950, label = rownames(loi), colour = factor(context))) +
geom_point(size=2, aes(shape = factor(type))) +
geom_text_repel(size=3) + labs(title = "Organic vs. carbonate content") +
xlab('Organic LOI (%)') +
ylab('Carbonate LOI (%)') +
theme(axis.title = element_text())
# Color by type
ggplot(loi,
aes(c550, c950)) +
geom_point(size=2, aes(shape = factor(context), colour = factor(type))) +
labs(title = "Organic vs. carbonate content") +
xlab('Organic LOI (%)') +
ylab('Carbonate LOI (%)') +
theme(axis.title = element_text())
# MB only
ggplot(loi_MB,
aes(c550, c950, label = rownames(loi_MB), colour = factor(context))) +
geom_point(size=2) +
geom_text_repel(size=3) + labs(title = "MB only organic vs. carbonate content") +
xlab('Organic LOI (%)') +
ylab('Carbonate LOI (%)') +
theme(axis.title = element_text())
# Stacked barplot (x = reorder(sample, LOI))
loi_long = gather(loi_MB, key = var, value = value, c550, c950)
ggplot(loi_long, aes(x = reorder(sample, value), y = value, fill = var)) +
geom_bar(stat = 'identity', position = 'stack') +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Distribution of LOI in 550 C (1) and 950 C (2)
boxplot(c550~context, data=tga)
boxplot(c950~context, data=tga)
# Removing duplicates and creating MB only subset
tga_MB <- subset(tga[9:20, ])
tga_MB <- subset(tga_MB[-4, ])
rownames(tga_MB)
## [1] "PP-9" "PP-10" "PP-11" "PP-12" "PP-13" "PP-14" "PP-15" "PP-16" "PP-17"
## [10] "PP-18" "PP-19"
# Distribution of LOI in 550 C (1) and 950 C (2)
boxplot(c550~context, data=tga_MB)
boxplot(c950~context, data=tga_MB)
# Color by context
ggplot(tga_MB,
aes(c550, c950, label = rownames(tga_MB), colour = factor(context))) +
geom_point(size=2) +
geom_text_repel(size=3) + labs(title = "Organic vs. carbonate content") +
xlab('Organic LOI (%)') +
ylab('Carbonate LOI (%)') +
theme(axis.title = element_text())
# Stacked barplot (x = reorder(sample, LOI))
tga_long = gather(tga_MB, key = var, value = value, c550, c950)
ggplot(tga_long, aes(x = reorder(Name, value), y = value, fill = var)) +
geom_bar(stat = 'identity', position = 'stack') +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Libraries
library(dplyr);
library(ggplot2);
library(ggrepel);
library(rgl);
library(plot3D);
library(colordistance)
color1 <- read.table("data/color_PP.txt", header = TRUE, sep = "\t", dec = ",")
rownames(color1) <- color1$Data.Name
color <- subset(color1[, 3:5])
summary(color)
## L a b
## Min. :62.38 Min. : 7.27 Min. :19.90
## 1st Qu.:63.47 1st Qu.: 8.50 1st Qu.:21.36
## Median :64.19 Median : 9.34 Median :21.72
## Mean :65.50 Mean : 9.29 Mean :21.92
## 3rd Qu.:65.78 3rd Qu.:10.56 3rd Qu.:22.96
## Max. :72.00 Max. :10.90 Max. :23.90
ggplot(color,
aes(a, b, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "Color") +
xlab("a D65") +
ylab("b D65")
ggplot(color1,
aes(X, L, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "Luminance") +
ylab("L* (D65)") +
xlab("Sample number")
ggplot(color,
aes(a, L, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "Luminance") +
ylab("L* (D65)") +
xlab("a")
ggplot(color,
aes(b, L, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "Luminance") +
ylab("L* (D65)") +
xlab("b")
# Colorful 3D plot with drop lines
scatter3D(color$a, color$b, color$L,
xlab = "a* (D65)", ylab = "b* (D65)", zlab="L* (D65)",
phi = 0, bty = "g", type = "h",
ticktype = "detailed", pch = 19, cex = 1.2)
text3D(color$a, color$b, color$L, labels = rownames(color),
add = TRUE, colkey = FALSE, cex = 0.7)
scatter3D(color$L, color$a, color$b,
xlab = "L* (D65)", ylab = "a* (D65)", zlab="b* (D65)",
phi = 0, bty = "g", type = "h",
ticktype = "detailed", pch = 19, cex = 1.2)
text3D(color$L, color$a, color$b, labels = rownames(color),
add = TRUE, colkey = FALSE, cex = 0.7)
Intro about the site
From literature we can identify three rough classes of elements that could be used to analyze mudbricks from pXRF results:
Good: Rb, Sr, Y, Zr, Nb, Th, Pb, Cu, Zn, Fe
Neutral: V, Cr, Co, Ni, Ca, Ti
Bad: Na, P, Ba (+ Ca, Al)
The “bad” elements were excluded from the pXRF results from the get-go. After scaling and averaging the remaining values we ended up with two analysis datasets: “pxrf_all” that contains all the elements that have non-zero results, and “pxrf_no_na”, in which all the elements that have even one NA result are omitted.
# Libraries
library(openxlsx);
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(cluster);
library(ggplot2);
library(ggrepel);
library(ggfortify);
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
# Read the raw data
pxrf <- read.xlsx("data/pxrf_AA.xlsx", sep=";")
# Remove any rows containing words 'ERROR' or 'TEST'
pxrf <- filter(pxrf, !grepl('TEST', Sample))
pxrf <- filter(pxrf, !grepl('ERROR', Sample))
# Turn any "LOD":s and missing values to "NA":s to allow scaling
# Removing "Na", S", "Cl", "P" and "Ba" already
values <- c("MgO", "Al2O3", "SiO2",
"K2O", "CaO", "Ti", "V", "Cr",
"Mn", "Fe", "Co", "Ni", "Cu", "Zn", "As",
"Se", "Rb", "Sr", "Y", "Zr", "Nb", "Mo",
"Rh", "Pd", "Ag", "Cd", "Sn", "Sb",
"La", "Ce", "Hf", "Ta", "W", "Pt", "Au",
"Hg", "Tl", "Pb", "Bi", "Th", "U")
pxrf_NA <- select(pxrf, one_of(values))
pxrf_NA <- pxrf_NA %>% mutate_if(is.character,as.numeric)
# Scaling the data with standard z-score method
scaled_pxrf <- as.data.frame(scale(pxrf_NA))
# Average data by "Sample", "Area" and "Type" columns from the original dataset
average_pxrf <- aggregate(scaled_pxrf, by = list(pxrf$Sample, pxrf$Area, pxrf$Type), FUN = mean)
# Assign sample names as row names, and rename the other newly created columns back to "Type" and "Area" for clarity
rownames(average_pxrf) <- average_pxrf$Group.1
average_pxrf <- average_pxrf %>% select(-Group.1)
names(average_pxrf)[names(average_pxrf) == "Group.2"] <- "Area"
names(average_pxrf)[names(average_pxrf) == "Group.3"] <- "Type"
# Change "Type" and "Area" to factors
average_pxrf$Type <- factor(average_pxrf$Type)
average_pxrf$Area <- factor(average_pxrf$Area)
# Removing columns that only have NA values
all_na <- function(x) any(!is.na(x))
pxrf_all <- average_pxrf %>% select_if(all_na)
# Removing columns if they contain any NA values at all
any_na <- function(x) all(!is.na(x))
pxrf_no_na <- average_pxrf %>% select_if(any_na)
pxrf_no_na:
summary(pxrf_no_na)
## Area Type MgO Al2O3
## Hill II :3 mudbrick:10 Min. :-0.7957 Min. :-1.47906
## Phase I :2 1st Qu.:-0.4773 1st Qu.:-0.41015
## Phase II :2 Median :-0.2741 Median : 0.05631
## Phase II or III:2 Mean : 0.0000 Mean : 0.00000
## Urartian silo :1 3rd Qu.: 0.3559 3rd Qu.: 0.45276
## Max. : 1.9131 Max. : 1.38944
## SiO2 K2O CaO Ti
## Min. :-1.6654 Min. :-1.55669 Min. :-1.4333259 Min. :-1.87044
## 1st Qu.:-0.3827 1st Qu.:-0.68787 1st Qu.:-0.6680205 1st Qu.:-0.31750
## Median : 0.3100 Median : 0.03848 Median :-0.0001026 Median : 0.02437
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000000 Mean : 0.00000
## 3rd Qu.: 0.6156 3rd Qu.: 0.75515 3rd Qu.: 0.7641718 3rd Qu.: 0.81009
## Max. : 0.8556 Max. : 1.11418 Max. : 1.2091486 Max. : 0.86883
## Mn Fe Ni Cu
## Min. :-1.03797 Min. :-0.73814 Min. :-0.94570 Min. :-1.2582
## 1st Qu.:-0.30897 1st Qu.:-0.28874 1st Qu.:-0.66424 1st Qu.:-0.5607
## Median :-0.09355 Median :-0.08696 Median :-0.07318 Median : 0.1094
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.21238 3rd Qu.: 0.08823 3rd Qu.: 0.45456 3rd Qu.: 0.3966
## Max. : 1.57654 Max. : 1.60860 Max. : 1.50299 Max. : 1.5865
## Zn Rb Sr Y
## Min. :-0.8847 Min. :-1.2036 Min. :-1.4059 Min. :-1.0149
## 1st Qu.:-0.3508 1st Qu.:-0.4637 1st Qu.:-0.3803 1st Qu.:-0.3542
## Median :-0.0839 Median :-0.2006 Median :-0.1555 Median :-0.1163
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.2720 3rd Qu.: 0.3256 3rd Qu.: 0.4915 3rd Qu.: 0.1216
## Max. : 1.5813 Max. : 1.4601 Max. : 1.4314 Max. : 1.7338
## Zr
## Min. :-1.1158
## 1st Qu.:-0.5166
## Median :-0.3001
## Mean : 0.0000
## 3rd Qu.: 0.1279
## Max. : 1.9054
pxrf_all:
summary(pxrf_all)
## Area Type MgO Al2O3
## Hill II :3 mudbrick:10 Min. :-0.7957 Min. :-1.47906
## Phase I :2 1st Qu.:-0.4773 1st Qu.:-0.41015
## Phase II :2 Median :-0.2741 Median : 0.05631
## Phase II or III:2 Mean : 0.0000 Mean : 0.00000
## Urartian silo :1 3rd Qu.: 0.3559 3rd Qu.: 0.45276
## Max. : 1.9131 Max. : 1.38944
##
## SiO2 K2O CaO Ti
## Min. :-1.6654 Min. :-1.55669 Min. :-1.4333259 Min. :-1.87044
## 1st Qu.:-0.3827 1st Qu.:-0.68787 1st Qu.:-0.6680205 1st Qu.:-0.31750
## Median : 0.3100 Median : 0.03848 Median :-0.0001026 Median : 0.02437
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000000 Mean : 0.00000
## 3rd Qu.: 0.6156 3rd Qu.: 0.75515 3rd Qu.: 0.7641718 3rd Qu.: 0.81009
## Max. : 0.8556 Max. : 1.11418 Max. : 1.2091486 Max. : 0.86883
##
## V Cr Mn Fe
## Min. :-0.46003 Min. :-0.686179 Min. :-1.03797 Min. :-0.73814
## 1st Qu.:-0.43566 1st Qu.:-0.459188 1st Qu.:-0.30897 1st Qu.:-0.28874
## Median :-0.19193 Median :-0.232198 Median :-0.09355 Median :-0.08696
## Mean :-0.09932 Mean :-0.232198 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.02742 3rd Qu.:-0.005208 3rd Qu.: 0.21238 3rd Qu.: 0.08823
## Max. : 0.56361 Max. : 0.221783 Max. : 1.57654 Max. : 1.60860
## NA's :5 NA's :8
## Ni Cu Zn As
## Min. :-0.94570 Min. :-1.2582 Min. :-0.8847 Min. :-1.03932
## 1st Qu.:-0.66424 1st Qu.:-0.5607 1st Qu.:-0.3508 1st Qu.:-0.21581
## Median :-0.07318 Median : 0.1094 Median :-0.0839 Median :-0.05111
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.05869
## 3rd Qu.: 0.45456 3rd Qu.: 0.3966 3rd Qu.: 0.2720 3rd Qu.: 0.44299
## Max. : 1.50299 Max. : 1.5865 Max. : 1.5813 Max. : 1.10179
## NA's :1
## Rb Sr Y Zr
## Min. :-1.2036 Min. :-1.4059 Min. :-1.0149 Min. :-1.1158
## 1st Qu.:-0.4637 1st Qu.:-0.3803 1st Qu.:-0.3542 1st Qu.:-0.5166
## Median :-0.2006 Median :-0.1555 Median :-0.1163 Median :-0.3001
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.3256 3rd Qu.: 0.4915 3rd Qu.: 0.1216 3rd Qu.: 0.1279
## Max. : 1.4601 Max. : 1.4314 Max. : 1.7338 Max. : 1.9054
##
## Nb
## Min. :-0.65796
## 1st Qu.:-0.50862
## Median :-0.06060
## Mean :-0.09379
## 3rd Qu.:-0.06060
## Max. : 0.93500
## NA's :4
# Drop the more dubious elements
pxrf_1 <- pxrf_no_na %>% select(-Al2O3)
pxrf_1 <- pxrf_1 %>% select(-CaO)
# Optimal number of clusters in K-means analysis: 2
k_max <- 8
twcss <- sapply(1:k_max, function(k){kmeans(pxrf_1[3:12], k)$tot.withinss})
qplot(x = 1:k_max, y = twcss, geom = 'line')
# K-means clustering in ggpairs: 2 clusters
km <-kmeans(pxrf_1[3:12], centers = 2)
cluster_x <- as.factor(km$cluster)
ggpairs(pxrf_1, mapping = aes(col = cluster_x))
PCA with only the no-NA:s elements:
# PCA
pca_1 <- prcomp(pxrf_1[3:15])
summary(pca_1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.4147 1.0889 0.9306 0.48309 0.37987 0.31662 0.24466
## Proportion of Variance 0.6901 0.1403 0.1025 0.02762 0.01708 0.01186 0.00708
## Cumulative Proportion 0.6901 0.8305 0.9330 0.96059 0.97767 0.98953 0.99662
## PC8 PC9 PC10
## Standard deviation 0.15630 0.06455 7.266e-17
## Proportion of Variance 0.00289 0.00049 0.000e+00
## Cumulative Proportion 0.99951 1.00000 1.000e+00
# PCA1 plots
biplot(pca_1, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"), main = "PCA limited elements")
autoplot(pca_1, data=pxrf_1, colour='Area', shape = FALSE, label = TRUE, main = "PCA grouped by area")
PCA with all the feasible elements:
# PCA with all elements, NA:s converted to zeros
pxrf_all[is.na(pxrf_all)] <- 0
pca_2 <- prcomp(pxrf_all[3:21])
summary(pca_2)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.6421 1.2026 0.98838 0.84185 0.45012 0.39002 0.30457
## Proportion of Variance 0.6529 0.1353 0.09136 0.06628 0.01895 0.01423 0.00868
## Cumulative Proportion 0.6529 0.7881 0.87946 0.94574 0.96469 0.97891 0.98759
## PC8 PC9 PC10
## Standard deviation 0.28180 0.23084 2.238e-16
## Proportion of Variance 0.00743 0.00498 0.000e+00
## Cumulative Proportion 0.99502 1.00000 1.000e+00
# PCA2 plots
biplot(pca_2, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"), main = "PCA more elements")
autoplot(pca_2, data=pxrf_all, colour='Area', shape = FALSE, label = TRUE, main = "PCA more elements grouped by area")
# HCA
# HCA dendrogam 1
dend <-
pxrf_1[3:12] %>% # data
dist %>% # calculate a distance matrix
hclust(method = "ward.D2") %>% # hierarchical clustering
as.dendrogram %>% # turn the object into a dendrogram
set("branches_k_color", k=4) %>%
set("branches_lwd", 1.2) %>%
set("labels_col", c("red", "black", "blue"), k=4) %>%
set("labels_cex", 1) %>%
set("leaves_pch", NA)
# plot the dend in usual "base" plotting engine:
plot(dend)
# HCA dendrogam 2:
HCA.ward4 <- hclust(dist(pxrf_1[3:12], method="euclid"), method="ward.D2")
plot(HCA.ward4, main="HCA with clusters")
rect.hclust(HCA.ward4, k=4)
# Libraries
library(dplyr);
library(openxlsx);
library(ggtern)
# Read and filter data
grain <- read.xlsx("data/grain_AA.xlsx", sep=";")
# Remove automatically created averages in order to include results from multiple runs for the same sample
grain <- grain %>%
filter(!grepl('Average', Sample)) %>%
filter(!grepl('test', Sample))
# Average data by "Sample", "Context" and "Type" columns from the original dataset
grain <- aggregate(grain, by=list(grain$Sample, grain$Context, grain$Type), FUN=mean)
# Remove the now empty columns for clarity
grain <- grain %>% select(-Sample)
grain <- grain %>% select(-Context)
grain <- grain %>% select(-Type)
# Assign sample names ("Group.1") as row names, and rename the other created columns back to "Type" and "Area" for clarity
rownames(grain) <- grain$Group.1
grain <- grain %>% select(-Group.1)
names(grain)[names(grain) == "Group.2"] <- "Context"
names(grain)[names(grain) == "Group.3"] <- "Type"
# Scaling clay-silt-sand portions to values between 0 and 1
# (Otherwise the differences in low clay percentages are too small to differentiate)
normalize <- function(x, na.rm=TRUE){(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))}
# Apply function "normalize" to clay, silt and sand columns
grain_01 <- as.data.frame(apply(grain[4:6], 2, normalize))
# Ternary plots
ggtern(data=grain_01, aes(x=Clay, y=Sand, z=Silt, color=grain$Context)) +
labs(title="Clay silt sand") +
theme_rgbw() +
theme_nomask() +
geom_point(size=0) +
geom_text(aes(label=rownames(grain_01)), size=3)
We did loss on ignition analysis both in the traditional manual method and with a modern TGA for comparison.
# Libraries
library(openxlsx);
library(dplyr);
library(ggplot2);
library(ggrepel);
library(tidyr);
loi <- read.xlsx("data/loi_AA.xlsx", sep=";")
tga <- read.xlsx("data/tga_AA.xlsx", sep=";")
summary(loi)
## sample crucible wet_weight sample_wet
## Length:10 Min. :8.066 Min. :10.70 Min. :1.923
## Class :character 1st Qu.:8.461 1st Qu.:10.93 1st Qu.:2.326
## Mode :character Median :8.995 Median :11.48 Median :2.558
## Mean :8.892 Mean :11.39 Mean :2.496
## 3rd Qu.:9.174 3rd Qu.:11.75 3rd Qu.:2.701
## Max. :9.890 Max. :12.18 Max. :2.842
## dry_weight after_550_C after_950_C context
## Min. :10.58 Min. :10.47 Min. :10.31 Length:10
## 1st Qu.:10.85 1st Qu.:10.78 1st Qu.:10.60 Class :character
## Median :11.43 Median :11.33 Median :11.16 Mode :character
## Mean :11.32 Mean :11.22 Mean :11.07
## 3rd Qu.:11.69 3rd Qu.:11.58 3rd Qu.:11.42
## Max. :12.13 Max. :12.06 Max. :11.94
## type
## Length:10
## Class :character
## Mode :character
##
##
##
summary(tga)
## Name Analysis;Date Batch Wet
## Length:12 Length:12 Length:12 Length:12
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Dry Mass_550 Mass_950 context
## Length:12 Length:12 Length:12 Length:12
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## type
## Length:12
## Class :character
## Mode :character
# Remove rows containing words 'kekkila' (internal standard)
tga <- filter(tga, !grepl('kekkila', Name))
# TGA character columns to numeric
tga[, c(4:7)] <- sapply(tga[, c(4:7)], as.numeric)
# Sample names as rownames
rownames(loi) <- loi$sample
rownames(tga) <- tga$Name
# Creating new columns for analysis by subtracting crucible mass;
# "dry" (sample mass after drying)
# "c550" (sample mass after 550 C)
# "c950" (sample mass after 950 C)
loi$dry <- (loi$dry_weight - loi$crucible)
loi$c550mass <- (loi$after_550_C - loi$crucible)
loi$c950mass <- (loi$after_950_C - loi$crucible)
loi$c550 <- (loi$dry - loi$c550mass)/(loi$dry) * 100
loi$c950 <- (loi$c550mass - loi$c950mass)/(loi$c550mass) * 100
# Same with TGA
tga$c550 <- (tga$Dry - tga$Mass_550)/(tga$Dry) * 100
tga$c950 <- (tga$Mass_550 - tga$Mass_950)/(tga$Mass_550) * 100
# Distribution of LOI in 550 C (1) and 950 C (2)
boxplot(c550~context, data=loi)
boxplot(c950~context, data=loi)
# Color by context
ggplot(loi,
aes(c550, c950, label = rownames(loi), colour = factor(context))) +
geom_point(size=2, aes(shape = factor(type))) +
geom_text_repel(size=3) + labs(title = "Organic vs. carbonate content") +
xlab('Organic LOI (%)') +
ylab('Carbonate LOI (%)') +
theme(axis.title = element_text())
# Stacked barplot (x = reorder(sample, LOI))
loi_long = gather(loi, key = var, value = value, c550, c950)
ggplot(loi_long, aes(x = reorder(sample, value), y = value, fill = var)) +
geom_bar(stat = 'identity', position = 'stack') +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Distribution of LOI in 550 C (1) and 950 C (2)
boxplot(c550~context, data=tga)
boxplot(c950~context, data=tga)
# Color by context
ggplot(tga,
aes(c550, c950, label = rownames(tga), colour = factor(context))) +
geom_point(size=2) +
geom_text_repel(size=3) + labs(title = "Organic vs. carbonate content") +
xlab('Organic LOI (%)') +
ylab('Carbonate LOI (%)') +
theme(axis.title = element_text())
# Stacked barplot (x = reorder(sample, LOI))
tga_long = gather(tga, key = var, value = value, c550, c950)
ggplot(tga_long, aes(x = reorder(Name, value), y = value, fill = var)) +
geom_bar(stat = 'identity', position = 'stack') +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Libraries
library(dplyr);
library(ggplot2);
library(ggrepel);
library(rgl);
library(plot3D);
library(colordistance)
color1 <- read.table("data/color_AA.txt", header = TRUE, sep = "\t", dec = ",")
rownames(color1) <- color1$Data.Name
color <- subset(color1[, 3:5])
summary(color)
## L a b
## Min. :51.37 Min. :3.880 Min. :15.92
## 1st Qu.:57.73 1st Qu.:4.402 1st Qu.:16.49
## Median :58.79 Median :4.555 Median :16.93
## Mean :58.18 Mean :5.067 Mean :17.32
## 3rd Qu.:59.58 3rd Qu.:4.695 3rd Qu.:17.32
## Max. :61.57 Max. :8.670 Max. :21.47
ggplot(color,
aes(a, b, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "Color (a*b)") +
xlab("a D65") +
ylab("b D65")
ggplot(color1,
aes(X, L, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "Luminance") +
ylab("L* (D65)") +
xlab("Sample number")
ggplot(color,
aes(a, L, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "A versus luminance") +
ylab("L* (D65)") +
xlab("a")
ggplot(color,
aes(b, L, label = rownames(color))) +
geom_point(size=2) +
geom_text_repel(size=3) +
labs(title = "B versus luminance") +
ylab("L* (D65)") +
xlab("b")
# Colorful 3D plot with drop lines
scatter3D(color$a, color$b, color$L,
xlab = "a* (D65)", ylab = "b* (D65)", zlab="L* (D65)",
phi = 0, bty = "g", type = "h",
ticktype = "detailed", pch = 19, cex = 1.2)
text3D(color$a, color$b, color$L, labels = rownames(color),
add = TRUE, colkey = FALSE, cex = 0.7)
scatter3D(color$L, color$a, color$b,
xlab = "L* (D65)", ylab = "a* (D65)", zlab="b* (D65)",
phi = 0, bty = "g", type = "h",
ticktype = "detailed", pch = 19, cex = 1.2)
text3D(color$L, color$a, color$b, labels = rownames(color),
add = TRUE, colkey = FALSE, cex = 0.7)